diff --git a/examples/privacy/embedding_inversion/vec2text/README.md b/examples/privacy/embedding_inversion/vec2text/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf4dcda525d949ff44c6f1b4bc8e13e3b4f0f40d
--- /dev/null
+++ b/examples/privacy/embedding_inversion/vec2text/README.md
@@ -0,0 +1,79 @@
+# vec2text embedding inversion
+
+## 0. Project overview
+
+This repository inverts text embeddings back into text. The technical details are described in **[Text Embeddings Reveal (Almost) As Much As Text (EMNLP 2023)](https://arxiv.org/abs/2310.06816)**.
+
+The main goal is to migrate the code from the paper's [reference repository](https://github.com/jxmorris12/vec2text) to MindArmour, the security SIG community of Huawei MindSpore, as a tool for strengthening privacy and security analysis.
+
+The core ideas of the paper are:
+
+**Controlled generation**: the authors frame inversion as a controlled generation problem whose goal is to produce text that matches a given embedding as closely as possible, i.e. text that, once re-embedded, stays close to the original embedding in latent space. The aim is a system that takes a true embedding, a hypothesis text sequence, and the hypothesis's position in embedding space, and predicts the true text sequence.
+
+**Iterative refinement**: an **iterative procedure** first generates an initial hypothesis and then refines it over a series of steps. Each refinement step produces a new hypothesis that, based on the difference between the current hypothesis's embedding and the target embedding, aligns better with the target.
+
+**Model architecture**: the model is a Transformer-based encoder-decoder adapted for the embedding-inversion task, with mechanisms for comparing embeddings and adjusting the generated text.
+
+![architecture](img/image.png)
+## 1. Training commands
+
+Current environment for the first stage:
+
+> GPU environment
+>
+> Python 3.9
+>
+> MindSpore 2.2.14+
+>
+> MindNLP 0.4.0
+
+Most of the code in this repository is used to train inversion models. Training roughly consists of **three steps:**
+
+1. Train a "zero-step" model that generates text directly from embeddings.
+2. Use the zero-step model to generate "hypotheses", which serve as training data for the corrector model.
+3. Train a corrector model that conditions on (true embedding, hypothesis, hypothesis embedding) triples to generate corrected text.
+
+In other words, training has two parts: the inversion stage (step 1) and the corrector stage (steps 2 and 3). This repository currently covers the migration of the first-stage (inversion) training code; migration of the second stage is in progress.
+
+**The basic command for the migrated first-stage code is:**
+
+> python3 run.py --per_device_train_batch_size 128 --per_device_eval_batch_size 128 --max_seq_length 32 --model_name_or_path **google-t5/t5-base** --dataset_name **nq** --embedder_model_name **gtr_base** --num_repeat_tokens 16 --embedder_no_grad True --num_train_epochs **20** --max_eval_samples **16** --eval_steps 10 --warmup_steps 300 --bf16=1 --use_frozen_embeddings_as_input False --experiment inversion --learning_rate 0.001 --output_dir ./saves/gtr-1 --save_steps 10000000000 --use_less_data **2560**
+
+All arguments are also documented in the code; here we highlight **a few key ones:**
+
+google-t5/t5-base is the model used for inversion, gtr_base is the embedding model, and nq is a dataset of more than 500,000 text passages.
+
+use_less_data is the number of training examples to use, and max_eval_samples is the number of validation examples (keep it small, since evaluation inverts embeddings back to text and is therefore slow).
+
+## 2. Known pitfalls
+
+The original project depends heavily on the Hugging Face transformers library, which does not support MindSpore.
+
+The migration therefore relies on the MindNLP community's reimplementation of the transformers APIs. That reimplementation is not yet fully mature; one serious issue is that, when running on GPU, the per-step training time grows linearly, which hurts overall training efficiency. An issue has been filed for this bug and the maintainers are actively working on it, so the experiments here compare results over a limited number of training epochs.
+
+One workaround is to train on Ascend, since Ascend hardware is better matched to MindSpore and MindNLP and the bug may not occur there.
+
+## 3. Experimental results
+
+### 1. Comparison experiments
+
+We trained on 25,600 examples from the nq dataset for 10 epochs in each configuration; the results are shown below:
+
+![comparison](img/comparison.png)
+
+* Three groups of experiments were run, varying the framework (MindSpore vs. PyTorch) and the embedding model (gtr-base vs. paraphrase-distilroberta).
+
+* With the same training data and number of epochs, the migrated code reaches comparable performance, which is preliminary evidence that the migration is correct.
+
+* Accuracy still needs to be verified further once the requested Ascend resources are available.
+
+### 2. Case study
+
+![case study](img/case_study.png)
+
+* When trained on MindSpore, the model accurately recovers words such as love, propose, municipality, and village, which is an intuitive sign that the migration works well.
+
+## 4. Future work
+
+* Fix the linear per-step slowdown in inner_training_loop on Ascend.
+* Continue reproducing the corrector stage to strengthen embedding inversion.
\ No newline at end of file
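To make the correct-and-re-embed loop described in the README concrete, the sketch below shows the overall control flow of iterative refinement. It is a minimal schematic under stated assumptions, not this repository's API: `invert_embedding`, `embed`, `zero_step_model`, `corrector_model`, `num_steps`, and `tol` are hypothetical names standing in for the frozen embedder (e.g. gtr_base), the stage-1 inversion model, and the stage-2 corrector.

```python
# Schematic sketch of vec2text-style iterative refinement (illustrative only).
# `embed`, `zero_step_model`, and `corrector_model` are hypothetical callables,
# not classes defined in this repository.
import numpy as np


def invert_embedding(target_emb, embed, zero_step_model, corrector_model,
                     num_steps=5, tol=1e-3):
    """Generate text whose re-embedding lies close to `target_emb`."""
    # Step 1: the zero-step inversion model proposes an initial hypothesis text.
    hypothesis = zero_step_model(target_emb)
    for _ in range(num_steps):
        # Re-embed the current hypothesis to see where it lands in latent space.
        hyp_emb = embed(hypothesis)
        if np.linalg.norm(target_emb - hyp_emb) < tol:
            break
        # Steps 2-3: the corrector conditions on the (true embedding, hypothesis,
        # hypothesis embedding) triple and emits a refined hypothesis.
        hypothesis = corrector_model(target_emb, hypothesis, hyp_emb)
    return hypothesis


if __name__ == "__main__":
    # Toy stand-ins so the control flow can be executed end to end.
    rng = np.random.default_rng(0)
    embed = lambda text: rng.standard_normal(8)
    zero_step_model = lambda emb: "initial hypothesis"
    corrector_model = lambda emb, hyp, hyp_emb: hyp + " (refined)"
    print(invert_embedding(rng.standard_normal(8), embed,
                           zero_step_model, corrector_model))
```

The two halves of this loop correspond to the two training stages above: step 1 is the inversion stage migrated in this repository, and steps 2-3 are the corrector stage that is still being ported.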
diff --git a/examples/privacy/embedding_inversion/vec2text/__init__.py b/examples/privacy/embedding_inversion/vec2text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6930d8a3755164f8f0acac7c6874400b6c85942d
--- /dev/null
+++ b/examples/privacy/embedding_inversion/vec2text/__init__.py
@@ -0,0 +1,13 @@
+"""
+This module initializes the vec2text package, providing access to various components
+for embedding inversion tasks, including data collation, model training, and hypothesis generation.
+"""
+# pylint: disable=W0406
+from . import (  # noqa: F401
+    aliases,
+    collator,
+    metrics,
+    models,
+    trainers,
+)
+from .trainers import Corrector  # noqa: F401
diff --git a/examples/privacy/embedding_inversion/vec2text/aliases.py b/examples/privacy/embedding_inversion/vec2text/aliases.py
new file mode 100644
index 0000000000000000000000000000000000000000..f884797aef341e50a1935c4b52c86d6e0cdea008
--- /dev/null
+++ b/examples/privacy/embedding_inversion/vec2text/aliases.py
@@ -0,0 +1,97 @@
+'''
+Get the zero-step model argument setup for the second phase of embedding inversion (the corrector).
+'''
+import analyze_utils
+
+# TODO always load args from disk, delete this dict.
+ARGS_DICT = {
+    "gtr_nq__msl128_beta": (
+        "--dataset_name nq "
+        "--per_device_train_batch_size 128 "
+        "--per_device_eval_batch_size 128 "
+        "--max_seq_length 128 "
+        "--model_name_or_path t5-base "
+        "--embedder_model_name gtr_base "
+        "--num_repeat_tokens 16 "
+        "--embedder_no_grad True "
+        "--exp_group_name mar17-baselines "
+        "--learning_rate 0.0003 "
+        "--freeze_strategy none "
+        "--embedder_fake_with_zeros False "
+        "--use_frozen_embeddings_as_input False "
+        "--num_train_epochs 24 "
+        "--max_eval_samples 500 "
+        "--eval_steps 25000 "
+        "--warmup_steps 100000 "
+        "--bf16=1 "
+        "--use_wandb=0"
+    ),
+    "paraphrase_nq__msl32__10epoch": (
+        "--per_device_train_batch_size 128 "
+        "--per_device_eval_batch_size 128 "
+        "--max_seq_length 32 "
+        "--model_name_or_path google-t5/t5-base "
+        "--dataset_name nq "
+        "--embedder_model_name gtr_base "
+        "--num_repeat_tokens 16 "
+        "--embedder_no_grad True "
+        "--num_train_epochs 1 "
+        "--max_eval_samples 16 "
+        "--eval_steps 400 "
+        "--warmup_steps 300 "
+        "--bf16 1 "
+        "--use_frozen_embeddings_as_input False "
+        "--experiment inversion "
+        "--learning_rate 0.001 "
+        "--output_dir ./saves/gtr-XXXxxx "
+        "--save_steps 10000000000 "
+        "--use_less_data 2560"
+    )
+}
+
+
+# Dictionary mapping model aliases to checkpoint folders.
+CHECKPOINT_FOLDERS_DICT = {
+    ############################# MSMARCO ##############################
+    "paraphrase_nq__msl32__10epoch": "/home/luoyf/vec2text/vec2text/saves/gtr-X",
+}
+
+
+def load_experiment_and_trainer_from_alias(alias: str, max_seq_length: int = None, use_less_data: int = None):
+    """
+    Load the experimental setup and corresponding trainer based on a given alias.
+
+    Parameters:
+        alias (str): The identifier used to select the experiment setup.
+        max_seq_length (int, optional): The maximum sequence length for the model. Defaults to None.
+        use_less_data (int, optional): Number of training examples to keep (caps the dataset size). Defaults to None.
+ + Returns: + type: Description of the return value (if applicable) + """ + try: + args_str = ARGS_DICT.get(alias) + checkpoint_folder = CHECKPOINT_FOLDERS_DICT[alias] + print("-----------args_str的值是---------------") + print(args_str) + + except KeyError: + print(f"{alias} not found in aliases.py, using as checkpoint folder") + args_str = None + checkpoint_folder = alias + print(f"loading alias {alias} from {checkpoint_folder}...") + experiment, trainer = analyze_utils.load_experiment_and_trainer( + checkpoint_folder, + args_str, + do_eval=False, + max_seq_length=max_seq_length, + use_less_data=use_less_data, + ) + return experiment, trainer + + +def load_model_from_alias(alias: str, max_seq_length: int = None): + _, trainer = load_experiment_and_trainer_from_alias( + alias, max_seq_length=max_seq_length + ) + return trainer.model diff --git a/examples/privacy/embedding_inversion/vec2text/analyze_utils.py b/examples/privacy/embedding_inversion/vec2text/analyze_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6a1965ec2a2a5ab2e784b49be677bdcd236576 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/analyze_utils.py @@ -0,0 +1,203 @@ +''' +load experiment and trainer +''' +import argparse +import os +import glob +import json +import shlex +from typing import Optional + + +import pandas as pd +from mindnlp.engine import get_last_checkpoint +import mindnlp.utils.logging as logging +import mindspore as ms + +import experiments +from models.config import InversionConfig +from run_args import DataArguments, ModelArguments, TrainingArguments, \ + parse_args_into_dataclasses + + +# from mindnlp.accelerate import PartialState +# import error, can't find this package. + +# no need for data transformation across multiple device +# def set_device_context(): +# try: +# # 尝试设置为 GPU +# context.set_context(device_target="GPU") +# print("Using GPU") +# except: +# try: +# # 如果 GPU 不可用,尝试设置为 Ascend +# context.set_context(device_target="Ascend") +# print("Using Ascend") +# except: +# # 如果 Ascend 也不可用,使用 CPU +# context.set_context(device_target="CPU") +# print("Using CPU") +# set_device_context() + +# device = torch.device( +# "cuda" +# if torch.cuda.is_available() +# else "mps" +# if torch.backends.mps.is_available() +# else "cpu" +# ) + +logging.set_verbosity_error() + + +#corrector 的第二个阶段的两次加载都调用这个了 +def load_experiment_and_trainer( + checkpoint_folder: str, + args_str: Optional[str] = None, + checkpoint: Optional[str] = None, + max_seq_length: Optional[int] = None, + use_less_data: Optional[int] = None, +): + ''' + (can't import due to circular import) -> trainers.InversionTrainer: + import previous aliases so that .bin that were saved prior to the + existence of the vec2text module will still work. + ''' + + if checkpoint is None: + checkpoint = get_last_checkpoint(checkpoint_folder) # a checkpoint + if checkpoint is None: + # This happens in a weird case, where no model is saved to saves/xxx/checkpoint-*/pytorch_model.bin + # because checkpointing never happened (likely a very short training run) but there is still a file + # available in saves/xxx/pytorch_model.bin. 
+ checkpoint = checkpoint_folder + print("Loading model from checkpoint:", checkpoint) + + if args_str is not None: + #先后有两次args,第一次是command line中的args,还有一次是调用的写入alias中的明文args + + args_list = shlex.split(args_str) # not namespace format which can be tackled with identical operation like the first call + + parser = argparse.ArgumentParser() + for i in range(0, len(args_list) - 1, 2): + arg_name = args_list[i].lstrip('-') + arg_value = args_list[i + 1] + + try: + arg_value = int(arg_value) + except ValueError: + if arg_value == 'True': + arg_value = True + elif arg_value == 'False': + arg_value = False + parser.add_argument(f'--{arg_name}', default=arg_value, type=type(arg_value)) + + args = parser.parse_args(args_list) + + # traing_args may not be a normal dataclass, and then should be adapted to the new one. + model_args, data_args, training_args = parse_args_into_dataclasses(args) + else: + try: + data_args = ms.load_checkpoint(os.path.join(checkpoint, os.pardir, "data_args.bin")) + except FileNotFoundError: + data_args = ms.load_checkpoint(os.path.join(checkpoint, "data_args.bin")) + try: + model_args = ms.load_checkpoint( + os.path.join(checkpoint, os.pardir, "model_args.bin") + ) + except FileNotFoundError: + model_args = ms.load_checkpoint(os.path.join(checkpoint, "model_args.bin")) + try: + training_args = ms.load_checkpoint( + os.path.join(checkpoint, os.pardir, "training_args.bin") + ) + except FileNotFoundError: + training_args = ms.load_checkpoint(os.path.join(checkpoint, "training_args.bin")) + + training_args.dataloader_num_workers = 0 # no multiprocessing :) + training_args.use_wandb = False + training_args.report_to = [] + training_args.mock_embedder = False + # training_args.no_cuda = not (context.get_context("device_target")=="GPU") + + if max_seq_length is not None: + print( + f"Overwriting max sequence length from {model_args.max_seq_length} to {max_seq_length}" + ) + model_args.max_seq_length = max_seq_length + + if use_less_data is not None: + print( + f"Overwriting use_less_data from {data_args.use_less_data} to {use_less_data}" + ) + data_args.use_less_data = use_less_data + + experiment = experiments.experiment_from_args(model_args, data_args, training_args) + trainer = experiment.load_trainer() + # pylint: disable=W0212 + trainer.model._keys_to_ignore_on_save = [] + try: + # pylint: disable=W0212 + trainer._load_from_checkpoint(checkpoint) + except RuntimeError: + # backwards compatibility from adding/removing layernorm + trainer.model.use_ln = False + trainer.model.layernorm = None + # try again without trying to load layernorm + # pylint: disable=W0212 + trainer._load_from_checkpoint(checkpoint) + return experiment, trainer + + +def load_trainer( + *args, **kwargs +): # (can't import due to circluar import) -> trainers.Inversion + _, trainer = load_experiment_and_trainer(*args, **kwargs) + return trainer + + +def load_results_from_folder(name: str) -> pd.DataFrame: + filenames = glob.glob(os.path.join(name, "*.json")) + data = [] + for f in filenames: + d = json.load(open(f, "r")) + if "_eval_args" in d: + # unnest args for evaluation + d.update(d.pop("_eval_args")) + data.append(d) + return pd.DataFrame(data) + + +def args_from_config(args_cls, config): + args = args_cls() + for key, value in vars(config).items(): + if key in dir(args): + setattr(args, key, value) + return args + + +def load_experiment_and_trainer_from_pretrained(name: str, use_less_data: int = 1000): + '''load experiment and trainer from pretrained model''' + + config = 
InversionConfig.from_pretrained(name) + model_args = args_from_config(ModelArguments, config) + data_args = args_from_config(DataArguments, config) + training_args = args_from_config(TrainingArguments, config) + + data_args.use_less_data = use_less_data + training_args.bf16 = 0 # no bf16 in case no support from GPU + training_args.local_rank = -1 # Don't load in DDP + + training_args.deepspeed_plugin = None # For backwards compatibility + training_args.use_wandb = False + training_args.report_to = [] + training_args.mock_embedder = False + training_args.output_dir = "saves/" + name.replace("/", "__") + + + experiment = experiments.experiment_from_args(model_args, data_args, training_args) + trainer = experiment.load_trainer() + trainer.model = trainer.model.__class__.from_pretrained(name) + # trainer.model.to(training_args.device) + return experiment, trainer diff --git a/examples/privacy/embedding_inversion/vec2text/collator.py b/examples/privacy/embedding_inversion/vec2text/collator.py new file mode 100644 index 0000000000000000000000000000000000000000..303b18f1c0734b1f4755fbb0b2803d5ae35be44b --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/collator.py @@ -0,0 +1,106 @@ +''' +data collator for correction +''' + +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +from mindnlp.transformers import PreTrainedTokenizer + + +@dataclass +class DataCollatorForCorrection: + """ + Data collator that will dynamically pad the inputs received, as well as the labels, and hypotheses. + + Based off of hf DataCollatorForSeq2Seq: + github.com/huggingface/transformers/blob/main/src/transformers/data/data_collator.py#L517 + """ + + tokenizer: PreTrainedTokenizer + label_pad_token_id: int = -100 + padding: Union[bool, str] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "ms" + + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors + labels = ( + [feature["labels"] for feature in features] + if "labels" in features[0].keys() + else None + ) + # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the + # same length to return tensors. 
+ max_label_length = max(len(l) for l in labels) + if self.pad_to_multiple_of is not None: + max_label_length = ( + (max_label_length + self.pad_to_multiple_of - 1) + // self.pad_to_multiple_of + * self.pad_to_multiple_of + ) + + padding_side = self.tokenizer.padding_side + + if "hypothesis_input_ids" in features[0].keys(): + max_hypothesis_length = max( + map(lambda d: len(d["hypothesis_input_ids"]), features) + ) + else: + max_hypothesis_length = 0 + hypothesis_features = [] + regular_features = [] + for feature in features: + ### pad labels + remainder = [self.label_pad_token_id] * ( + max_label_length - len(feature["labels"]) + ) + if isinstance(feature["labels"], list): + feature["labels"] = ( + feature["labels"] + remainder + if padding_side == "right" + else remainder + feature["labels"] + ) + elif padding_side == "right": + feature["labels"] = np.concatenate( + [feature["labels"], remainder] + ).astype(np.int64) + else: + feature["labels"] = np.concatenate( + [remainder, feature["labels"]] + ).astype(np.int64) + #### add to lists + regular_features.append( + {k: v for k, v in feature.items() if not k.startswith("hypothesis_")} + ) + + hypothesis_features.append( + { + k.replace("hypothesis_", ""): v + for k, v in feature.items() + if k.startswith("hypothesis_") + } + ) + + new_features = self.tokenizer.pad( + regular_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + ) + + if max_hypothesis_length > 0: + hypothesis_features = self.tokenizer.pad( + hypothesis_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + ) + hypothesis_features = { + f"hypothesis_{k}": v for k, v in hypothesis_features.items() + } + return {**new_features, **hypothesis_features} + return new_features diff --git a/examples/privacy/embedding_inversion/vec2text/data_helpers.py b/examples/privacy/embedding_inversion/vec2text/data_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..a74165de9b6bf08393490a796fb031b053bd6223 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/data_helpers.py @@ -0,0 +1,106 @@ +''' +download dataset +''' +import os +from typing import Dict, List +import datasets +from mindnlp.dataset import load_dataset +from run_args import DataArguments + + +def retain_dataset_columns(d, allowed_columns: List[str]): + column_names_to_remove = [c for c in d.features if c not in allowed_columns] + return d.remove_columns(column_names_to_remove) + + +def load_nq_dpr_corpus()-> datasets.Dataset: + return load_dataset("jxm/nq_corpus_dpr") + + +def load_msmarco_corpus(): + # has columns ["title", "text"]. 
only one split ("train") + dataset_dict = load_dataset("Tevatron/msmarco-passage-corpus") + return dataset_dict["train"] + + +def create_omi_ex(ex: Dict[str, str]) -> Dict[str, str]: + ex["text"] = ex["user"] + return ex + + +def create_ompi_ex(ex: Dict[str, str]) -> Dict[str, str]: + ex["user"] = ex["user"].strip() + ex["system"] = ex["system"].strip() + ex["text"] = ex["system"] + "\n\n" + ex["user"] + ex["prefix"] = ex["system"] + "\n\n" + ex["suffix"] = ex["user"] + return ex + + +def get_world_size() -> int: + try: + return os.environ.get("WORLD_SIZE", 1) + except (RuntimeError, ValueError): + return 1 + + + +def dataset_from_args(data_args: DataArguments) -> datasets.DatasetDict: + """Loads a dataset from data_args create in `run_args`.""" + if data_args.dataset_name == "nq": + raw_datasets = load_nq_dpr_corpus() + raw_datasets["validation"] = raw_datasets["dev"] + elif data_args.dataset_name == "msmarco": + raw_datasets = load_msmarco_corpus() + raw_datasets = raw_datasets.train_test_split(test_size=0.01) + raw_datasets["validation"] = raw_datasets["test"] + else: + raise ValueError(f"unsupported dataset {data_args.dataset_name}") + return raw_datasets + + +def load_ag_news_test(): + return load_dataset("ag_news")["test"] + + +def load_xsum_val(col: str): + d = load_dataset("xsum")["validation"] + d = d.rename_column(col, "text") + return d + + +def load_wikibio_val(): + d = load_dataset("wiki_bio")["val"] + d = d.rename_column("target_text", "text") + return d + + +def load_arxiv_val(): + d = load_dataset("ccdv/arxiv-summarization")["validation"] + d = d.rename_column("abstract", "text") + return d + +def load_anthropic_toxic_prompts(): + d = load_dataset("wentingzhao/anthropic-hh-first-prompt")["train"] + d = d.rename_column("user", "text") + return d + +def load_python_code_instructions_18k_alpaca(): + d = load_dataset("iamtarun/python_code_instructions_18k_alpaca")["train"] + d = d.rename_column("instruction", "text") + return d + +def load_standard_val_datasets(): + """Loads a pre-defined set of standard val datasets.""" + d = { + "ag_news": load_ag_news_test(), + "anthropic_toxic_prompts": load_anthropic_toxic_prompts(), + "arxiv": load_arxiv_val(), + "python_code_alpaca": load_python_code_instructions_18k_alpaca(), + # "xsum_doc": load_xsum_val("document"), + # "xsum_summ": load_xsum_val("summary"), + "wiki_bio": load_wikibio_val(), + } + d = {k: retain_dataset_columns(v, ["text"]) for k, v in d.items()} + + return datasets.DatasetDict(d) diff --git a/examples/privacy/embedding_inversion/vec2text/ds_config.json b/examples/privacy/embedding_inversion/vec2text/ds_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36575ec1b01a37c57873a4f026a5182d87da91c5 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/ds_config.json @@ -0,0 +1,29 @@ +{ + "flops_profiler": { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null + }, + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "train_micro_batch_size_per_gpu": "auto", + "bf16": { + "enabled": "auto" + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "contiguous_gradients": true + } + } \ No newline at end of file diff --git a/examples/privacy/embedding_inversion/vec2text/experiments.py 
b/examples/privacy/embedding_inversion/vec2text/experiments.py new file mode 100644 index 0000000000000000000000000000000000000000..efe3a8947b7ce76777a5b4459253bd08f6c12e17 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/experiments.py @@ -0,0 +1,631 @@ +''' +preprocess data and set experimental procedure +''' +import abc +import functools +import hashlib +import json +import os +import resource +from typing import Dict, Optional +import logging + +import numpy as np +import mindspore as ms +from mindspore.dataset import GeneratorDataset +import mindnlp.engine +from mindnlp.transformers.modeling_utils import PreTrainedModel +from mindnlp.transformers import AutoTokenizer +from mindnlp.transformers.tokenization_utils_fast import PreTrainedTokenizer +import datasets # needed by mindnlp + +import aliases +import analyze_utils +from data_helpers import dataset_from_args +from models.config import InversionConfig +from models import CorrectorEncoderModel, InversionModel +from run_args import DataArguments, ModelArguments, TrainingArguments +from tokenize_data import DataCollatorForSeq2Seq, embed_dataset_batch, tokenize_function_, tokenize_function +from utils import dataset_map_single_worker, get_num_proc + + + +# Allow W&B to start slowly. +os.environ["WANDB__SERVICE_WAIT"] = "300" +os.environ["_WANDB_STARTUP_DEBUG"] = "true" +os.environ["TOKENIZERS_PARALLELISM"] = "False" + + +# big issues! occasionally found no access to GPU with ms + + +device = ms.get_context("device_target") +logger = logging.getLogger(__name__) + +# We maintain our own cache because huggingface datasets caching +# doesn't always work properly. +DATASET_CACHE_PATH = os.environ.get( + "VEC2TEXT_CACHE", os.path.expanduser("~/.cache/inversion") +) + + +def md5_hash_kwargs(**kwargs) -> str: + # We ignore special hf args that start with _ like '__cached__setup_devices'. + safe_kwargs = {k: str(v) for k, v in kwargs.items() if not k.startswith("_")} + s = json.dumps(safe_kwargs, sort_keys=True) + return hashlib.md5(s.encode()).hexdigest() + + +class Experiment(abc.ABC): + ''' + experiment base class + ''' + def __init__(self, model_args: ModelArguments, data_args: DataArguments, training_args: TrainingArguments): + # Interactions between args handled here: + training_args.metric_for_best_model = f"{data_args.dataset_name}_loss" + + logger.info( + "Save checkpoints according to metric_for_best_model %s:", + training_args.metric_for_best_model, + ) + + # Save all args. + self.model_args = model_args + self.data_args = data_args + self.training_args = training_args + # Set random seed, add hash to output path. + # transformers.set_seed(training_args.seed) + mindnlp.engine.set_seed(training_args.seed) + + + if training_args.output_dir is None: + training_args.output_dir = os.path.join("saves", self.kwargs_hash) + print(f"Experiment output_dir = {training_args.output_dir}") + # Set up output_dir and wandb. 
+ self._consider_init_wandb() + + @property + def config(self) -> InversionConfig: + return InversionConfig( + **vars(self.data_args), + **vars(self.model_args), + **vars(self.training_args), + ) + + @property + def is_llama_chat(self) -> bool: + return self.model_args.embedder_model_name in [ + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Llama-2-70b-chat-hf", + ] + + @property + def dataset_kwargs(self) -> Dict[str, str]: + return { + "model_name": self.model_args.model_name_or_path, + "embedder_name": self.model_args.embedder_model_name, + "max_seq_length": str(self.model_args.max_seq_length), + "use_less_data": str(self.data_args.use_less_data), + "embedder_model_api": str(self.model_args.embedder_model_api), + } + + def run(self): + print("----------run start?-------------") + if self.training_args.do_eval: + self.evaluate() + else: + self.train() + + def train(self) -> Dict: + '''training''' + training_args = self.training_args + logger.info("*** Training ***") + training_argsdevice = ms.get_context("device_target") + # Log on each process a small summary of training. + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_argsdevice}, " + + f"fp16 training: {training_args.fp16}, bf16 training: {training_args.bf16}" + ) + checkpoint = self._get_checkpoint() + + logging.info("Experiment::train() loaded checkpoint %s", checkpoint) + trainer = self.load_trainer() + print(f"train() called – resume-from_checkpoint = {checkpoint}") + train_result = trainer.train(resume_from_checkpoint=checkpoint) + # trainer.save_model() # Saves the tokenizer too for easy upload + metrics = train_result.metrics + print(metrics) + + trainer.log_metrics("train", metrics) + # trainer.save_metrics("train", metrics) + trainer.save_state() + print("success!!!!!great man!!!") + return metrics + + def evaluate(self) -> Dict: + '''Evaluation''' + logger.info("*** Evaluate ***") + trainer = self.load_trainer() + num_eval_samples = len(trainer.eval_dataset) + metrics = trainer.evaluate() + max_eval_samples = ( + self.data_args.max_eval_samples + if self.data_args.max_eval_samples is not None + else num_eval_samples + ) + metrics["eval_samples"] = min(max_eval_samples, num_eval_samples) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + return metrics + + def _get_checkpoint(self) -> Optional[str]: + '''get checkpoint to implement the correction''' + training_args = self.training_args + last_checkpoint = None + if (os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir): + last_checkpoint = mindnlp.engine.get_last_checkpoint( + training_args.output_dir + ) + if (last_checkpoint is None and os.listdir(training_args.output_dir)): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + if (last_checkpoint is not None and training_args.resume_from_checkpoint is None): + logger.info( + "Checkpoint detected, resuming training at %s. 
To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.", + last_checkpoint + ) + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if checkpoint: + logger.info("Loading from checkpoint %s", checkpoint) + else: + logger.info("No checkpoint found, training from scratch") + + print(checkpoint) + print(last_checkpoint) + return checkpoint + + @property + def kwargs_hash(self) -> str: + all_args = { + **vars(self.model_args), + **vars(self.data_args), + **vars(self.training_args), + } + all_args.pop("local_rank") + # print("all_args:", all_args) + return md5_hash_kwargs(**all_args) + + @property + def _world_size(self) -> int: + #not found in mindspore similar with torch.distributed.get_world_size() + #TODO: add some distribution function to it + try: + return os.environ.get("WORLD_SIZE", 1) + except (RuntimeError, ValueError): + return 1 + + @property + def _is_main_worker(self) -> bool: + return (self.training_args.local_rank <= 0) and ( + int(os.environ.get("LOCAL_RANK", 0)) <= 0 + ) + + @property + @abc.abstractmethod + def _wandb_project_name(self) -> str: + raise NotImplementedError() + + @property + def _wandb_exp_name(self) -> str: + name_args = [ + self.training_args.exp_group_name, + self.training_args.exp_name, + self.model_args.model_name_or_path, + self.model_args.embedder_model_name, + ] + name_args = [n for n in name_args if ((n is not None) and len(n))] + return "__".join(name_args) + + def _consider_init_wandb(self) -> None: + '''whether to init wandb''' + if self.training_args.use_wandb and self._is_main_worker: + import wandb + + wandb.init( + project=self._wandb_project_name, + name=self._wandb_exp_name, + id=self.kwargs_hash, + resume=True, + ) + training_args = vars(self.training_args) + # deepspeed kwargs are not json serializable + training_args = { + k: v for k, v in training_args.items() if "deepspeed" not in k + } + wandb.config.update( + { + **vars(self.model_args), + **vars(self.data_args), + **training_args, + }, + allow_val_change=True, + ) + # Long-running experiments have been killed because wandb + # runs out of file descriptors to write summary files + # to. Very silly error, but seems unfixed: + # https://github.com/wandb/wandb/issues/2825 + # + # Anyway, this line of code should (hopefully) set the + # limit to infinity so this can't happen. 
+ resource.setrlimit( + resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY) + ) + else: + # Disable W&B + pass + # os.environ["WANDB_MODE"] = "disabled" + # os.environ["WANDB_DISABLED"] = "true" + + @abc.abstractmethod + def load_trainer(self) -> mindnlp.engine.Trainer: + raise NotImplementedError() + + @abc.abstractmethod + def load_model(self) -> PreTrainedModel: + raise NotImplementedError() + + def load_tokenizer(self) -> PreTrainedTokenizer: + '''load tokenizer''' + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.model_name_or_path, + padding="max_length", + truncation="max_length", + max_length=self.model_args.max_seq_length, + ) + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Disable super annoying warning: + # https://github.com/huggingface/transformers/issues/22638 + tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + return tokenizer + + #lack API(transformers.DataCollatorForSeq2Seq) in mindnlp and achieve it from scratch + def get_collator(self, tokenizer: PreTrainedTokenizer) -> DataCollatorForSeq2Seq: + return DataCollatorForSeq2Seq( + tokenizer, + model=None, + label_pad_token_id=-100, + padding="max_length", + max_length=self.model_args.max_seq_length, + pad_to_multiple_of=8 if self.training_args.fp16 else None, + ) + + def _load_train_dataset_uncached(self, tokenizer: AutoTokenizer, embedder_tokenizer: AutoTokenizer): + ''' + load train dataset uncached + ''' + data_args = self.data_args + # Load datasets + + logger.info("Loading dataset '%s'...", self.data_args.dataset_name) + raw_datasets = dataset_from_args(self.data_args) + + + # Remove extra features except for 'frozen_embeddings' which could be embeddings + # saved to disk. + # column_names = list(raw_datasets["train"].features) + + column_names = raw_datasets["train"].column_names + + # pylint: disable=C0103 + ALLOWED_COLUMN_NAMES = {"frozen_embeddings"} + column_names = [c for c in column_names if c not in ALLOWED_COLUMN_NAMES] + if data_args.use_less_data and (data_args.use_less_data > 0): + new_length = min(len(raw_datasets["train"]), data_args.use_less_data) + train_datasets = raw_datasets["train"].take(new_length) + new_length_ = min(len(raw_datasets["validation"]), data_args.max_eval_samples) + eval_datasets = raw_datasets["validation"].take(new_length_) + + + + print( + ">> using fast tokenizers:", tokenizer.is_fast, embedder_tokenizer.is_fast + ) + + + train_datasets = train_datasets.map(tokenize_function(tokenizer, embedder_tokenizer, + self.model_args.max_seq_length, padding=False), + num_parallel_workers=8) + + #no more proc, some mistakes + eval_datasets = eval_datasets.map(tokenize_function_(tokenizer, embedder_tokenizer, + self.model_args.max_seq_length, padding=False),) + + #index_ds = ds.NumpySlicesDataset(np.array(range(train_datasets.get_dataset_size())),\ + # column_names=['idx']) + + #------------------------------val process-------------------------------- + max_eval_samples = min( + self.data_args.use_less_data, self.data_args.max_eval_samples + ) + eval_datasets = eval_datasets.take(max_eval_samples) + #index_ds_ = ds.NumpySlicesDataset(list(range(max_eval_samples)), column_names=['idx']) + return train_datasets, eval_datasets + + def _prepare_val_datasets_dict(self, model: PreTrainedModel, tokenizer: AutoTokenizer, + embedder_tokenizer: AutoTokenizer, val_datasets_dict: datasets.DatasetDict): + '''prepare_val_datasets_dict''' + for name, dataset in val_datasets_dict.items(): + max_eval_samples = 
min(len(dataset), self.data_args.max_eval_samples) + val_datasets_dict[name] = val_datasets_dict[name].select( + range(max_eval_samples) + ) + val_datasets_dict[name] = val_datasets_dict[name].add_column( + "idx", range(len(val_datasets_dict[name])) + ) + val_datasets_dict[name].set_format("ms") + + tokenize_fn = tokenize_function + + for key in val_datasets_dict: + val_datasets_dict[key] = dataset_map_single_worker( + dataset=val_datasets_dict[key], + map_fn=tokenize_fn( + tokenizer=tokenizer, + embedder_tokenizer=embedder_tokenizer, + text_column_name="text", + max_seq_length=self.model_args.max_seq_length, + padding=False, + ), + remove_columns=["text"], + batched=True, + batch_size=1024, + num_proc=get_num_proc(), + desc="Running tokenizer on dataset", + ) + + # filter out empty examples (these exist for xsum documents). + val_datasets_dict = val_datasets_dict.filter(lambda ex: ex["length"] > 1) + + if self.model_args.use_frozen_embeddings_as_input: + # assert torch.cuda.is_available() + # model = model.to(device) + + new_tokenized_datasets = {} + for key, d in val_datasets_dict.items(): + new_tokenized_datasets[key] = dataset_map_single_worker( + dataset=d, + map_fn=functools.partial(embed_dataset_batch, model), + batched=True, + batch_size=self.training_args.per_device_train_batch_size, + # pylint: disable=W0212 + new_fingerprint=( + d._fingerprint + md5_hash_kwargs(**self.dataset_kwargs) + "" + ), + num_proc=1, + ) + val_datasets_dict = datasets.DatasetDict(new_tokenized_datasets) + return val_datasets_dict + + def load_train_and_val_datasets(self, tokenizer: AutoTokenizer, + embedder_tokenizer: AutoTokenizer): + '''load_train_and_val_datasets''' + dataset_kwargs: Dict[str, str] = self.dataset_kwargs + + # Only set this if it's true, for backwards-compatibility with + # when we forgot to cache using this argument. + if self.model_args.use_frozen_embeddings_as_input: + dataset_kwargs["use_frozen_embeddings_as_input"] = "True" + # Deprecated arg below. We used to cache different + # embeddings for suffixes. Then they became the same. + # Removing the below line will invalidate other + # people's caches. 
+ dataset_kwargs["suffix_conditioning"] = "False" + + # os.environ["TOKENIZERS_PARALLELISM"] = "True" + print( + "Loading datasets with TOKENIZERS_PARALLELISM =", + os.environ.get("TOKENIZERS_PARALLELISM"), + ) + ###################################################################### + train_dataset_kwargs = { + "dataset_name": self.data_args.dataset_name, + **dataset_kwargs, + } + train_dataset_path = os.path.join( + DATASET_CACHE_PATH, (md5_hash_kwargs(**train_dataset_kwargs) + ".npy") + ) + # Optionally set a train dataset path override + train_dataset_path = os.environ.get( + "VEC2TEXT_TRAIN_DATASET_PATH", train_dataset_path + ) + if os.path.exists(train_dataset_path): + print("path?", train_dataset_path) + print("loading train dataset from path:", train_dataset_path) + train_datasets = datasets.load_from_disk(train_dataset_path) + else: + train_datasets, eval_datasets = self._load_train_dataset_uncached( + tokenizer=tokenizer, + embedder_tokenizer=embedder_tokenizer, + ) + + #-------------------------------------------- + # i = 0 + # data_list = [] + # for data in train_datasets.create_dict_iterator(): + # i += 1 + # data = data['text'] + # data_list.append(data) + # if (i == self.data_args.use_less_data): + # break + # column_names = ['input_ids', 'attention_mask', 'labels', 'length', 'embedder_input_ids', + # 'embedder_attention_mask'] + # + # def data_generator(): + # for data in data_list: + # yield ( + # data['input_ids'], data['attention_mask'], data['labels'], data['length'][0], + # data['embedder_input_ids'], + # data['embedder_attention_mask']) + # + # train_datasets = GeneratorDataset(data_generator, column_names) + # -------------------------------------------- + column_names = ['input_ids', 'attention_mask', 'labels', 'length', 'embedder_input_ids', + 'embedder_attention_mask'] + + # create numpy memmap in order to lazy download, but no use, so disgusting bug! 
+ # filename = '/home/luoyf/vec2text/vec2text/saves/train_dataset/processed_data_' +\ + #str(self.data_args.use_less_data) + '.dat' + data_list = [] + u = -1 + # store in memmap + for data in train_datasets: + u += 1 + input_ids = data[0]['input_ids'].asnumpy() + attention_mask = data[0]['attention_mask'].asnumpy() + labels = data[0]['labels'].asnumpy() + length = data[0]['length'][0].asnumpy()#为了存储方便扩展成32位 + embedder_input_ids = data[0]['embedder_input_ids'].asnumpy() + embedder_attention_mask = data[0]['embedder_attention_mask'].asnumpy() + # idx = np.full(32, data[1].asnumpy()) + combined_array = [input_ids, attention_mask, labels, length, embedder_input_ids, embedder_attention_mask] + # data_memmap[u] = combined_array + data_list.append(combined_array) + if u == self.data_args.use_less_data - 1: + break + for i in range(1): + print(data_list[i]) + print("训练数据格式如上↑") + + def data_generator(): + for i in range(self.data_args.use_less_data): + yield ( + ms.Tensor(data_list[i][0].astype(np.int32)), + ms.Tensor(data_list[i][1].astype(np.int32)), + ms.Tensor(data_list[i][2].astype(np.int32)), + ms.Tensor(data_list[i][3].astype(np.int32)), + ms.Tensor(data_list[i][4].astype(np.int32)), + ms.Tensor(data_list[i][5].astype(np.int32)), + ) + train_datasets = GeneratorDataset(data_generator, column_names) + + data_list_ = [] + for data in eval_datasets.create_dict_iterator(): + data = data['text'] + data_list_.append(data) + column_names_ = ['input_ids', 'attention_mask', 'labels', 'length', 'embedder_input_ids', + 'embedder_attention_mask'] + + def data_generator_(): + for data in data_list_: + yield ( + data['input_ids'], data['attention_mask'], data['labels'], data['length'][0], + data['embedder_input_ids'], data['embedder_attention_mask']) + + eval_datasets = GeneratorDataset(data_generator_, column_names_) + print("convert success!") + + return (train_datasets, eval_datasets) + + +class InversionExperiment(Experiment): + ''' + inversion experiment + ''' + @property + def trainer_cls(self): + return trainers.InversionTrainer + + @property + def _wandb_project_name(self) -> str: + return "emb-inv-4" + + def load_model(self) -> PreTrainedModel: + return InversionModel( + config=self.config, + ) + # convert MapDataset with "text" key to GeneratorDataset without it + + + def load_trainer(self) -> mindnlp.engine.Trainer: + model = self.load_model() + train_dataset, eval_dataset = self.load_train_and_val_datasets( + tokenizer=model.tokenizer, + embedder_tokenizer=model.embedder_tokenizer, + ) + return self.trainer_cls( + model=model, + args=self.training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + # data_collator=self.get_collator(tokenizer=model.tokenizer), + ) + + +class CorrectorExperiment(Experiment): + ''' + corrector experiment + ''' + @property + def _wandb_project_name(self) -> str: + return "emb-correct-1" + + def load_trainer(self) -> mindnlp.engine.Trainer: + if self.training_args.corrector_model_from_pretrained: + ( + _, + inversion_trainer, + ) = analyze_utils.load_experiment_and_trainer_from_pretrained( + name=self.training_args.corrector_model_from_pretrained, + # max_seq_length=self.model_args.max_seq_length, + use_less_data=self.data_args.use_less_data, + ) + else: + ( + _, + inversion_trainer, + ) = aliases.load_experiment_and_trainer_from_alias( + alias=self.training_args.corrector_model_alias, + max_seq_length=self.model_args.max_seq_length, + use_less_data=self.data_args.use_less_data, + ) + model = self.load_model() + return trainers.Corrector( + 
model=model, + inversion_trainer=inversion_trainer, + args=self.training_args, + # data_collator=DataCollatorForCorrection( + # tokenizer=inversion_trainer.model.tokenizer + # ), + ) + + def load_model(self) -> PreTrainedModel: + return CorrectorEncoderModel( + config=self.config, + ) + + +EXPERIMENT_CLS_MAP = { + "inversion": InversionExperiment, + "corrector": CorrectorExperiment, + "corrector_encoder": CorrectorExperiment, # backwards-compatible; does same thing as just 'corrector' +} + + +def experiment_from_args(model_args, data_args, training_args) -> Experiment: + if training_args.experiment in EXPERIMENT_CLS_MAP: + experiment_cls = EXPERIMENT_CLS_MAP[training_args.experiment] # type: ignore + else: + raise ValueError(f"Unknown experiment {training_args.experiment}") + return experiment_cls(model_args, data_args, training_args) # type: ignore diff --git a/examples/privacy/embedding_inversion/vec2text/img/.keep b/examples/privacy/embedding_inversion/vec2text/img/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/privacy/embedding_inversion/vec2text/img/case_study.png b/examples/privacy/embedding_inversion/vec2text/img/case_study.png new file mode 100644 index 0000000000000000000000000000000000000000..9cf17ca646016fa8343a27bd1ff055043d36305b Binary files /dev/null and b/examples/privacy/embedding_inversion/vec2text/img/case_study.png differ diff --git a/examples/privacy/embedding_inversion/vec2text/img/comparison.png b/examples/privacy/embedding_inversion/vec2text/img/comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..c66ebb5869f934e7c5c89d4482e8424e679dc89e Binary files /dev/null and b/examples/privacy/embedding_inversion/vec2text/img/comparison.png differ diff --git a/examples/privacy/embedding_inversion/vec2text/img/image.png b/examples/privacy/embedding_inversion/vec2text/img/image.png new file mode 100644 index 0000000000000000000000000000000000000000..38568826beb018d04a7a2dc46af4a795ddee5bbe Binary files /dev/null and b/examples/privacy/embedding_inversion/vec2text/img/image.png differ diff --git a/examples/privacy/embedding_inversion/vec2text/models/__init__.py b/examples/privacy/embedding_inversion/vec2text/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..737fca8919177d25ef5e50f2c4dc624dbfdc2661 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/models/__init__.py @@ -0,0 +1,13 @@ +''' +models init +''' +#pylint: disable=E0001 +from .corrector_encoder import CorrectorEncoderModel +from .inversion import InversionModel # noqa: F401 +from .model_utils import ( # noqa: F401 + EMBEDDER_MODEL_NAMES, + EMBEDDING_TRANSFORM_STRATEGIES, + FREEZE_STRATEGIES, + load_embedder_and_tokenizer, + load_encoder_decoder, +) diff --git a/examples/privacy/embedding_inversion/vec2text/models/config.py b/examples/privacy/embedding_inversion/vec2text/models/config.py new file mode 100644 index 0000000000000000000000000000000000000000..eb7d48c664cddfb9b5db66902af13c0f8743279b --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/models/config.py @@ -0,0 +1,38 @@ +''' +dummy configuration class +''' +import json + +from mindnlp.transformers import PretrainedConfig + +NEW_ATTRIBUTES = { + "embedder_torch_dtype": "float32", +} + + +class InversionConfig(PretrainedConfig): + """We create a dummy configuration class that will just set properties + based on whatever kwargs we pass in. 
+ + When this class is initialized (see experiments.py) we pass in the + union of all data, model, and training args, all of which should + get saved to the config json. + """ + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + try: + json.dumps(value) + setattr(self, key, value) + except TypeError: + # value was not JSON-serializable, skip + continue + super().__init__() + + def __getattribute__(self, key): + try: + return super().__getattribute__(key) + except AttributeError as e: + if key in NEW_ATTRIBUTES: + return NEW_ATTRIBUTES[key] + raise e diff --git a/examples/privacy/embedding_inversion/vec2text/models/corrector_encoder.py b/examples/privacy/embedding_inversion/vec2text/models/corrector_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..4ef2bc22fbb9fca927f93d461b3bd9286660e4bf --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/models/corrector_encoder.py @@ -0,0 +1,214 @@ +''' +model to inverse embedding with the inversion model +''' + +import copy +from typing import Dict, Optional + +import mindspore as ms +import mindspore.ops as ops +from mindnlp.transformers import PreTrainedModel, AutoModelForSeq2SeqLM + +from .config import InversionConfig + + +class CorrectorEncoderModel(PreTrainedModel): + """Embeds text and concats with a provided embedding. + + TODO improve comment here. + """ + + config_class = InversionConfig + encoder_decoder: PreTrainedModel + + def __init__(self, config: InversionConfig,): + super().__init__(config=config) + if config.embedder_model_api: + embedder_dim = 1536 + else: + embedder_dim = 768 + bottleneck_dim = embedder_dim + + num_repeat_tokens = config.num_repeat_tokens + ignore_hypothesis_embedding = config.corrector_ignore_hypothesis_embedding + self.use_ff_dropout = False + + encoder_decoder = AutoModelForSeq2SeqLM.from_pretrained( + config.model_name_or_path + ) + self.encoder_decoder = encoder_decoder # .to_bettertransformer() + self.embedder_dim = embedder_dim + self.num_repeat_tokens = num_repeat_tokens + self.encoder_hidden_dim = self.encoder_decoder.config.hidden_size + self.embedding_transform_1 = ms.nn.SequentialCell( + ms.nn.Dense(self.embedder_dim, bottleneck_dim), + # ms.nn.Dropout( + # self.encoder_decoder.config.dropout_rate if self.use_ff_dropout else 0.0 + # ), + ms.nn.GELU(), + ms.nn.Dense(bottleneck_dim, self.encoder_hidden_dim * num_repeat_tokens), + ) + self.embedding_transform_2 = ms.nn.SequentialCell( + ms.nn.Dense(self.embedder_dim, bottleneck_dim), + # ms.nn.Dropout( + # self.encoder_decoder.config.dropout_rate if self.use_ff_dropout else 0.0 + # ), + ms.nn.GELU(), + ms.nn.Dense(bottleneck_dim, self.encoder_hidden_dim * num_repeat_tokens), + ) + self.embedding_transform_3 = ms.nn.SequentialCell( + ms.nn.Dense(self.embedder_dim, bottleneck_dim), + # ms.nn.Dropout( + # self.encoder_decoder.config.dropout_rate if self.use_ff_dropout else 0.0 + # ), + ms.nn.GELU(), + ms.nn.Dense(bottleneck_dim, self.encoder_hidden_dim * num_repeat_tokens), + ) + self.ignore_hypothesis_embedding = ignore_hypothesis_embedding + # TODO argparse; default to 0? + self.training_embedding_noise_level = 0 + # self.training_embedding_noise_level = 1e-5 # adding for openai... 
+        self.use_ln = True
+        if self.use_ln:
+            self.layernorm = ms.nn.LayerNorm([self.encoder_hidden_dim])
+        # print(f"Corrector encoder noise level {self.training_embedding_noise_level}")
+
+    def get_encoder_embedding(self, embedding: ms.Tensor, hypothesis_embedding: ms.Tensor,
+                              hypothesis_input_ids: ms.Tensor, hypothesis_attention_mask: ms.Tensor):
+        '''
+        get encoder embedding
+        '''
+
+        batch_size, _ = embedding.shape
+        assert embedding.shape == (batch_size, self.embedder_dim)
+        assert hypothesis_embedding.shape == (batch_size, self.embedder_dim)
+
+        if (self.training) and (self.training_embedding_noise_level > 0):
+            embedding += self.training_embedding_noise_level * ops.randn(
+                embedding.shape
+            )
+            hypothesis_embedding += self.training_embedding_noise_level * ops.randn(
+                hypothesis_embedding.shape
+            )
+
+        if self.ignore_hypothesis_embedding:
+            # For "No Feedback" ablation
+            hypothesis_embedding = embedding
+
+        diff_embedding = embedding - hypothesis_embedding
+
+        embedding = self.embedding_transform_1(embedding)
+        embedding = embedding.reshape(
+            (batch_size, self.num_repeat_tokens, self.encoder_hidden_dim)
+        )
+        #
+        diff_embedding = self.embedding_transform_2(diff_embedding)
+        diff_embedding = diff_embedding.reshape(
+            (batch_size, self.num_repeat_tokens, self.encoder_hidden_dim)
+        )
+        #
+        hypothesis_embedding = self.embedding_transform_3(hypothesis_embedding)
+        hypothesis_embedding = hypothesis_embedding.reshape(
+            (batch_size, self.num_repeat_tokens, self.encoder_hidden_dim)
+        )
+        inputs_embeds = self.encoder_decoder.encoder.embed_tokens(hypothesis_input_ids)
+        #
+        ones = ops.ones(
+            (batch_size, 1), dtype=ms.bool_)
+        # TODO: pad_token_id or eos_token_id? Or does it not matter?
+        sep_token = ones * self.encoder_decoder.config.eos_token_id
+        sep_token = self.encoder_decoder.encoder.embed_tokens(sep_token)
+        # inputs_embeds = ops.cat((sep_token, embedding, sep_token, hypothesis_embedding, inputs_embeds), dim=1)
+        inputs_embeds = ops.cat(
+            (
+                sep_token,
+                embedding,
+                sep_token,
+                hypothesis_embedding,
+                sep_token,
+                diff_embedding,
+                sep_token,
+                inputs_embeds,
+            ),
+            axis=1,
+        )
+        if self.use_ln:
+            inputs_embeds = self.layernorm(inputs_embeds)
+        # attention_mask = ops.cat(
+        #     (ones.tile(1, 4 + 3 * self.num_repeat_tokens), hypothesis_attention_mask),
+        #     axis=1,
+        # )
+        ones_repeated = ones.tile((1, 4 + 3 * self.num_repeat_tokens))
+
+        attention_mask = ops.cat(
+            (ones_repeated, hypothesis_attention_mask),
+            axis=1,
+        )
+        # ones_repeated = ops.Tile()(ms.Tensor([1], dtype=ms.float32), (1, 4 + 3 * self.num_repeat_tokens))
+        # attention_mask = ops.Concat(axis=1)((ones_repeated, hypothesis_attention_mask))
+        return (inputs_embeds, attention_mask)
+
+    def generate(self, inputs: Dict[str, ms.Tensor], generation_kwargs: Dict[str, ms.Tensor],
+                 return_dict_in_generate: bool = False) -> ms.Tensor:
+        '''generate text from the inverted embedding'''
+
+        if "max_length" not in generation_kwargs:
+            generation_kwargs = copy.copy(
+                generation_kwargs
+            )  # make a copy so we can edit
+            generation_kwargs["max_length"] = inputs.get(
+                "input_ids", inputs["embedder_input_ids"]
+            ).shape[1]
+
+        inputs_embeds, attention_mask = self.get_encoder_embedding(
+            embedding=inputs["frozen_embeddings"],
+            hypothesis_input_ids=inputs["hypothesis_input_ids"],
+            hypothesis_attention_mask=inputs["hypothesis_attention_mask"],
+            hypothesis_embedding=inputs["hypothesis_embedding"],
+        )
+
+        if "decoder_input_ids" in inputs:
+            return self.encoder_decoder.generate(
+                # required: input embeddings
+                inputs_embeds=inputs_embeds,
+
attention_mask=attention_mask, + return_dict_in_generate=return_dict_in_generate, + output_scores=return_dict_in_generate, + # optional: input IDs (for starting generation). + # typically not set unless generating prefixes for + # reranking. + decoder_input_ids=inputs["decoder_input_ids"], + # decoder_attention_mask=inputs["decoder_attention_mask"], + **generation_kwargs, + ) + + return self.encoder_decoder.generate( + # required: input embeddings + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + return_dict_in_generate=return_dict_in_generate, + output_scores=return_dict_in_generate, + # optional: input IDs (for starting generation). + # typically not set unless generating prefixes for + # reranking. + **generation_kwargs, + ) + + def forward(self, embedding: ms.Tensor, hypothesis_embedding, hypothesis_input_ids: ms.Tensor, + hypothesis_attention_mask: ms.Tensor, labels: Optional[ms.Tensor] = None,): + ''' + forward function + ''' + inputs_embeds, attention_mask = self.get_encoder_embedding( + embedding=embedding, + hypothesis_embedding=hypothesis_embedding, + hypothesis_input_ids=hypothesis_input_ids, + hypothesis_attention_mask=hypothesis_attention_mask, + ) + return self.encoder_decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + labels=labels, + ) diff --git a/examples/privacy/embedding_inversion/vec2text/models/inversion.py b/examples/privacy/embedding_inversion/vec2text/models/inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..25feccf45d6ed53ee612baf753ddc669dc9a51cc --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/models/inversion.py @@ -0,0 +1,322 @@ +''' +inversion model +''' + +import copy +import logging +from typing import Dict, Optional + +import mindspore as ms +import mindspore.ops as ops +from mindnlp.transformers import PreTrainedModel, PreTrainedTokenizer, AutoModelForSeq2SeqLM, RobertaModel + +from .config import InversionConfig +from .model_utils import ( + FREEZE_STRATEGIES, + disable_dropout, + freeze_params, + load_embedder_and_tokenizer, + load_encoder_decoder, + load_tokenizer, + mean_pool, +) + + +logger = logging.getLogger(__name__) + + +class InversionModel(PreTrainedModel): + """A class of model that conditions on embeddings from a pre-trained sentence embedding model + to decode text autoregressively. 
+ """ + + config_class = InversionConfig + embedder: ms.nn.Cell + embedder_tokenizer: PreTrainedTokenizer # embedder's tokenizer + encoder_decoder: AutoModelForSeq2SeqLM + encoder_decoder_lora: bool # Whether to use LoRA for the encoder-decoder model + tokenizer: PreTrainedTokenizer # encoder_decoder's tokenizer + embedding_transform: ms.nn.Cell # Module that transformers embedder output into encoder-decoder input + bottleneck_dim: int # Bottleneck dimension for embedding_transform + num_repeat_tokens: int # Sequence length for repeating embedder embedding for encoder-decoder input + embedder_dim: int # Hidden dimension of embedding model + embedder_no_grad: bool # Disable gradients for embedding model + embedder_fake_with_zeros: bool # Whether to just provide zeros as input for encoder-decoder (unconditional) + embedding_transform_strategy: str # Way to transform bottleneck embedding into input for encoder-decoder + use_frozen_embeddings_as_input: bool # Whether to train/evaluate on frozen embeddings + embedded_tokens: ms.Tensor # used for decoding + embedder_model_api: Optional[str] + + def __init__(self, config: InversionConfig): + super().__init__(config=config) + + embedder_model_api = config.embedder_model_api + embedder_fake_with_zeros = config.embedder_fake_with_zeros + use_frozen_embeddings_as_input = config.use_frozen_embeddings_as_input + encoder_dropout_disabled = config.encoder_dropout_disabled + decoder_dropout_disabled = config.decoder_dropout_disabled + embeddings_from_layer_n = config.embeddings_from_layer_n + + + encoder_decoder = load_encoder_decoder( + model_name=config.model_name_or_path, + lora=config.use_lora, + ) + + + embedder, embedder_tokenizer = load_embedder_and_tokenizer( + name=config.embedder_model_name, torch_dtype=config.embedder_torch_dtype + ) + + tokenizer = load_tokenizer( + config.model_name_or_path, + max_length=config.max_seq_length, + ) + num_repeat_tokens = config.num_repeat_tokens + embedder_no_grad = config.embedder_no_grad + + self.encoder_decoder = encoder_decoder # .to_bettertransformer() + self.num_repeat_tokens = num_repeat_tokens + + self.embedder_is_decoder = False + + encoder_hidden_dim = self.encoder_decoder.config.hidden_size + if embedder_model_api: + assert use_frozen_embeddings_as_input, "must precompute embeddings w/ api" + # Hard-code OpenAI embedding dim + self.embedder_dim = 1536 + bottleneck_dim = self.embedder_dim + # elif isinstance(embedder, mindnlp.transformers.models.t5.modeling_t5.T5ForConditionalGeneration): + # self.embedder_dim = embedder.get_sentence_embedding_dimension() + # bottleneck_dim = self.embedder_dim + else: + self.embedder_dim = embedder.config.hidden_size + bottleneck_dim = self.embedder_dim + self.embedder_no_grad = embedder_no_grad + self.use_frozen_embeddings_as_input = use_frozen_embeddings_as_input + self.bottleneck_dim = bottleneck_dim + + self.embedding_transform = ms.nn.SequentialCell( + ms.nn.Dense(self.embedder_dim, bottleneck_dim), + ms.nn.Dropout(self.encoder_decoder.config.dropout_rate), + ms.nn.GELU(), # TODO consider dropout or normalization here. 
+ ms.nn.Dense(bottleneck_dim, encoder_hidden_dim * num_repeat_tokens), + ) + if encoder_dropout_disabled: + disable_dropout(self.encoder_decoder.encoder) + if decoder_dropout_disabled: + disable_dropout(self.encoder_decoder.decoder) + disable_dropout(self.encoder_decoder.lm_head) + ###################################################### + self.tokenizer = tokenizer + self.embedder = embedder + if self.embedder_no_grad: + for param in self.embedder.parameters(): + param.requires_grad = False + + self.embedder.eval() + + self.embedder_tokenizer = embedder_tokenizer + self.embedder_model_api = embedder_model_api + # self.freeze(freeze_strategy=config.freeze_strategy) + self.embedder_fake_with_zeros = embedder_fake_with_zeros + + self.embedding_transform_strategy = "repeat" # "none" # "repeat" + self.embeddings_from_layer_n = embeddings_from_layer_n + self.noise_level = 0 + + def _freeze_encoder(self): + freeze_params(self.encoder_decoder.encoder) + + def _freeze_decoder(self): + # github.com/huggingface/transformers/blob/master/src/transformers/models/t5/modeling_t5.py#L1229-L1231 + freeze_params(self.encoder_decoder.decoder) + freeze_params(self.encoder_decoder.lm_head) + + def freeze(self, freeze_strategy: str): + '''maybe freeze module of encoder_decoder for subsequent training''' + + assert freeze_strategy in FREEZE_STRATEGIES + + if freeze_strategy == "decoder": + self._freeze_decoder() + elif freeze_strategy == "encoder": + self._freeze_encoder() + elif freeze_strategy == "encoder_and_decoder": + self._freeze_encoder() + self._freeze_decoder() + # in this case, freeze embeddings too + freeze_params(self.encoder_decoder.shared) + elif freeze_strategy == "none": + pass + else: + raise ValueError(f"invalid freezing strategy {freeze_strategy}") + + def _process_embedder_output(self, outputs, attention_mask: ms.Tensor): + + '''process_embedder_output''' + + if hasattr(outputs, "pooler_output") and (outputs.pooler_output is not None): + return outputs.pooler_output + if self.embeddings_from_layer_n is not None: + assert hasattr( + outputs, "hidden_states" + ), "output missing hidden states - remember to initialize the model with output_hidden_states=True?" 
+ hidden_state = outputs.hidden_states[self.embeddings_from_layer_n] + embeddings = mean_pool(hidden_state, attention_mask) + else: + hidden_state = outputs.last_hidden_state + embeddings = mean_pool(hidden_state, attention_mask) + return embeddings + + def call_embedding_model(self, input_ids: ms.Tensor, attention_mask: ms.Tensor, + token_type_ids: Optional[ms.Tensor] = None): + ''' + call_embedding_model + ''' + embedder = self.embedder + # print("** call_embedding_model") + if self.embedder_no_grad: + embedder.eval() + # pylint: disable=R1705 + if self.embedder_fake_with_zeros: + batch_size = input_ids.shape[0] + return ops.zeros( + (batch_size, self.embedder_dim), + dtype=ms.float32 + ) + + elif isinstance(self.embedder, RobertaModel): + #before : mindnlp.transformers.models.t5.modeling_t5.T5ForConditionalGeneration + #self.embedder : RobertaModel + # sentence-transformers is kind of really annoying + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} + + if token_type_ids is not None: + model_inputs["token_type_ids"] = token_type_ids + # print(model_inputs) + # print(model_inputs['input_ids'].shape) + + + model_output = embedder(model_inputs['input_ids']) + + embeddings = self._process_embedder_output(model_output, attention_mask) + + else: + model_output = embedder(input_ids=input_ids, attention_mask=attention_mask) + embeddings = self._process_embedder_output(model_output, attention_mask) + + if self.noise_level > 0: + embeddings += self.noise_level * ops.randn( + embeddings.shape + ) + return embeddings + + def embed_and_project(self, embedder_input_ids: Optional[ms.Tensor], + embedder_attention_mask, + frozen_embeddings: Optional[ms.Tensor] = None,): + ''' + embed_and_project + ''' + assert not ((embedder_input_ids is None) and (frozen_embeddings is None)) + if frozen_embeddings is not None: + embeddings = frozen_embeddings + assert len(embeddings.shape) == 2 # batch by d + elif self.embedder_no_grad: + embeddings = self.call_embedding_model(input_ids=embedder_input_ids, + attention_mask=embedder_attention_mask,) + else: + embeddings = self.call_embedding_model( + input_ids=embedder_input_ids, + attention_mask=embedder_attention_mask, + ) + if self.embedding_transform_strategy == "repeat": + if embeddings.dtype != self.dtype: + embeddings = embeddings.astype(self.dtype) + repeated_embeddings = self.embedding_transform(embeddings) + # linear outputs a big embedding, reshape into a sequence of regular size embeddings. 
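+            # e.g. with the defaults num_repeat_tokens=16 and a T5-base hidden size
+            # of 768, a (batch, 12288) projection becomes a (batch, 16, 768)
+            # pseudo-sequence that is passed to the encoder as inputs_embeds.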
+ embeddings = repeated_embeddings.reshape( + (*repeated_embeddings.shape[:-1], self.num_repeat_tokens, -1) + ) + elif self.embedding_transform_strategy == "nearest_neighbors": + # TODO + raise NotImplementedError() + else: + raise ValueError( + f"unknown embedding transformation strategy {self.embedding_transform_strategy}" + ) + attention_mask = ops.ones( + (embeddings.shape[0], embeddings.shape[1]), dtype=ms.float32) + return embeddings, attention_mask + + def generate(self, inputs: Dict[str, ms.Tensor], generation_kwargs: Dict[str, ms.Tensor],): + ''' + generate embedding + ''' + generation_kwargs = copy.copy(generation_kwargs) # make a copy so we can edit + inputs_embeds, attention_mask = self.embed_and_project( + embedder_input_ids=inputs.get("embedder_input_ids"), + embedder_attention_mask=inputs.get("embedder_attention_mask"), + # frozen_embeddings=inputs.get("frozen_embeddings"), + # embedder_input_ids=inputs[4], + # embedder_attention_mask=inputs[5], + + ) + + if "decoder_input_ids" in inputs: + return self.encoder_decoder.generate( + # required: input embeddings + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + # optional: input IDs (for starting generation). + # typically not set unless generating prefixes for + # reranking. + decoder_input_ids=inputs["decoder_input_ids"], + **generation_kwargs, + ) + + return self.encoder_decoder.generate( + # required: input embeddings + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + # optional: input IDs (for starting generation). + # typically not set unless generating prefixes for + # reranking. + **generation_kwargs, + ) + + + def generate_corrector(self, inputs: Dict[str, ms.Tensor], generation_kwargs: Dict[str, ms.Tensor],): + ''' + 因为数据格式不一样,所以corrector中的generate改成这个名字了generate_corrector + ''' + generation_kwargs = copy.copy(generation_kwargs) # make a copy so we can edit + print(inputs) + inputs_embeds, attention_mask = self.embed_and_project( + embedder_input_ids=inputs.get("embedder_input_ids"), + embedder_attention_mask=inputs.get("embedder_attention_mask"), + frozen_embeddings=inputs.get("frozen_embeddings"), + ) + + return self.encoder_decoder.generate(inputs_embeds=inputs_embeds, + attention_mask=attention_mask, **generation_kwargs,) + + + def forward(self, embedder_input_ids: ms.Tensor, embedder_attention_mask: ms.Tensor, + labels: Optional[ms.Tensor] = None, + frozen_embeddings: Optional[ms.Tensor] = None, decoder_input_ids: Optional[ms.Tensor] = None,): + ''' + forward function + ''' + # Unused: input_ids, attention_mask + inputs_embeds, attention_mask = self.embed_and_project( + embedder_input_ids=embedder_input_ids, + embedder_attention_mask=embedder_attention_mask, + frozen_embeddings=frozen_embeddings, + ) + return self.encoder_decoder( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + labels=labels, + decoder_input_ids=decoder_input_ids, + ) diff --git a/examples/privacy/embedding_inversion/vec2text/models/model_utils.py b/examples/privacy/embedding_inversion/vec2text/models/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..774eea40e7a3b1e15a1c7c5ea37d1aa18f2f8984 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/models/model_utils.py @@ -0,0 +1,168 @@ +''' +model utils for training +''' + +from typing import Any, Dict + +import mindspore as ms +from mindnlp.sentence import SentenceTransformer +from mindnlp.transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, \ + PreTrainedTokenizer + 
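+# Illustrative sketch of how the helpers below are typically wired together.
+# The real call sites live in the model/experiment setup code and are not shown
+# here; the model names and arguments are only examples:
+#
+#   encoder_decoder = load_encoder_decoder("t5-base")
+#   embedder, embedder_tokenizer = load_embedder_and_tokenizer("gtr_base", torch_dtype="float32")
+#   tokenizer = load_tokenizer("t5-base", max_length=32)
+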
+EMBEDDER_MODEL_NAMES = [ + "bert", + "bert__random_init", + "contriever", + "dpr", + "gtr_base", + "gtr_base__random_init", + "medicalai/ClinicalBERT", + "gtr_large", + "ance_tele", + "dpr_st", + "gtr_base_st", + "paraphrase-distilroberta", + "sentence-transformers/all-MiniLM-L6-v2", + "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-2-13b-hf", + "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-13b-chat-hf", + "nomic-ai/nomic-embed-text-v1", + "gpt2", + "gpt2-medium", + "gpt2-large", + "gpt2-xl", +] + + +FREEZE_STRATEGIES = ["decoder", "encoder_and_decoder", "encoder", "none"] +EMBEDDING_TRANSFORM_STRATEGIES = ["repeat"] + + +device = ms.get_context("device_target") + + +def disable_dropout(model: ms.nn.Cell): + dropout_modules = [m for m in model.modules() if isinstance(m, ms.Dropout)] + for m in dropout_modules: + m.p = 0.0 + print( + f"Disabled {len(dropout_modules)} dropout modules from model type {type(model)}" + ) + + +def freeze_params(model: ms.nn.Cell): + total_num_params = 0 + for _, params in model.named_parameters(): + params.requires_grad = False + total_num_params += params.numel() + # print(f"Froze {total_num_params} params from model type {type(model)}") + + +def mean_pool(hidden_states: ms.Tensor, attention_mask: ms.Tensor): + b, _, d = hidden_states.shape + unmasked_outputs = hidden_states * attention_mask[..., None] + pooled_outputs = unmasked_outputs.sum(axis=1) / attention_mask.sum(axis=1)[:, None] + assert pooled_outputs.shape == (b, d) + return pooled_outputs + + +def max_pool(hidden_states: ms.Tensor, attention_mask: ms.Tensor) -> ms.Tensor: + b, _, d = hidden_states.shape + unmasked_outputs = hidden_states * attention_mask[..., None] + pooled_outputs = unmasked_outputs.max(axis=1) + assert pooled_outputs.shape == (b, d) + return pooled_outputs + + +def stack_pool(hidden_states: ms.Tensor, attention_mask: ms.Tensor): + b, s, d = hidden_states.shape + unmasked_outputs = hidden_states * attention_mask[..., None] + pooled_outputs = unmasked_outputs.reshape((b, s * d)) # stack along seq length + assert pooled_outputs.shape == (b, s * d) + return pooled_outputs + + +def load_embedder_and_tokenizer(name: str, torch_dtype: str):# pylint: disable=W0613 + + ''' + TODO make abstract/argparse for it etc. + name = "gpt2" #### <--- TEMP. For debugging. Delete! 
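+
+    Note: for name == "gtr_base" only the encoder of sentence-transformers/gtr-t5-base
+    is returned, and the torch_dtype argument is currently unused in this MindSpore
+    port (hence the unused-argument pylint disable on the signature).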
+ ''' + model_kwargs = { + #"low_cpu_mem_usage": True, # Not compatible with DeepSpeed + "output_hidden_states": False, + } + + if name == "gtr_base": + print("gtr-t5-base is regarded as embedder model......") + model = AutoModel.from_pretrained( + "sentence-transformers/gtr-t5-base", **model_kwargs + ).encoder + tokenizer = AutoTokenizer.from_pretrained( + "sentence-transformers/gtr-t5-base" + ) + elif name == "paraphrase-distilroberta": + model = AutoModel.from_pretrained( + "sentence-transformers/paraphrase-distilroberta-base-v1", **model_kwargs + ) + tokenizer = AutoTokenizer.from_pretrained( + "sentence-transformers/paraphrase-distilroberta-base-v1" + ) + # elif name == "paraphrase-distilroberta": + # tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") + # model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") + elif name == "medicalai/ClinicalBERT": + model = AutoModel.from_pretrained( + "medicalai/ClinicalBERT", **model_kwargs + ) + tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT") + elif name.startswith("gpt2"): + model = AutoModelForCausalLM.from_pretrained( + name, + **model_kwargs, + ) + # model.to_bettertransformer() + tokenizer = AutoTokenizer.from_pretrained(name) + tokenizer.pad_token = tokenizer.eos_token + + elif name.startswith("sentence-transformers/"): + model = SentenceTransformer(name) + tokenizer = model.tokenizer + + else: + print(f"WARNING: Trying to initialize from unknown embedder {name}") + model = AutoModel.from_pretrained(name, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(name) + + # model = torch.compile(model) + return model, tokenizer + +# pylint: disable=W0613 +def load_encoder_decoder(model_name: str, lora: bool = False): + model_kwargs: Dict[str, Any] = { + #"low_cpu_mem_usage": True,z + } + return AutoModelForSeq2SeqLM.from_pretrained( + model_name, **model_kwargs + ) + + +def load_tokenizer(name: str, max_length: int) -> PreTrainedTokenizer: + ''' + load tokenizer + ''' + tokenizer = AutoTokenizer.from_pretrained( + name, + padding="max_length", + truncation="max_length", + max_length=max_length, + ) + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Disable super annoying warning: + # https://github.com/huggingface/transformers/issues/22638 + tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + return tokenizer diff --git a/examples/privacy/embedding_inversion/vec2text/precompute_train_hypotheses.py b/examples/privacy/embedding_inversion/vec2text/precompute_train_hypotheses.py new file mode 100644 index 0000000000000000000000000000000000000000..cc8bcf8d2a52e1addc1008d4f619ed16de26d61c --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/precompute_train_hypotheses.py @@ -0,0 +1,79 @@ +''' +precompute train hypotheses +''' +import argparse +import glob +import os + +import aliases +import datasets +import tqdm + + +def precompute(start_idx: int, num_samples: int): + '''precompute''' + + out_path = (f"/home/jxm3/research/retrieval/inversion/msmarco_msl128_hypotheses/" + f"msmarco_{num_samples}_{start_idx}.arrow") + if os.path.exists(out_path): + print("already precomputed; exiting") + # load the previously-trained msmarco model + _, trainer = aliases.load_experiment_and_trainer_from_alias( + "openai_msmarco__msl128__100epoch__correct", + max_seq_length=128, + use_less_data=-1, + ) + + end_idx = min(len(trainer.train_dataset), start_idx + num_samples) + print("Original length:", len(trainer.train_dataset)) + trainer.train_dataset = 
trainer.train_dataset.select(range(start_idx, end_idx)) + print("Sampled length:", len(trainer.train_dataset)) + hypothesis_path = trainer.precompute_hypotheses() + os.symlink(hypothesis_path, out_path) + print( + f"precomputed {num_samples} samples from msmarco from idx {start_idx} and saved to {out_path}" + ) + + +def gather(): + '''gather''' + n_samples = 136772 # gather all files that have this many samples + files = sorted( + glob.glob( + f"/home/jxm3/research/retrieval/inversion/msmarco_msl128_hypotheses/msmarco_{n_samples}_*" + ) + ) + gathered_dataset_path = "/home/jxm3/research/retrieval/inversion/msmarco_msl128_hypotheses/msmarco_full.cache" + datasets_list = [] + print(f"found {len(files)} files to concatenate.") + print(f"\t first three: {files[:3]}") + for f in tqdm.tqdm(files, desc="loading datasets"): + datasets_list.append(datasets.Dataset.load_from_disk(f)) + print("concatenating") + full_dataset = datasets.concatenate_datasets(datasets_list) + print("and...saving.") + full_dataset.save_to_disk(gathered_dataset_path) + print(f"gathered {len(datasets_list)} and saved to {gathered_dataset_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="precompute MSMARCO hypotheses") + parser.add_argument("--start_idx", type=int, required=True, help="Starting index") + parser.add_argument( + "--num_samples", type=int, required=True, help="Number of samples" + ) + parser.add_argument( + "--work", + type=str, + required=False, + default="precompute", + choices=["precompute", "gather"], + help="type of work to do", + ) + + args = parser.parse_args() + + if args.work == "precompute": + precompute(args.start_idx, args.num_samples) + else: + gather() diff --git a/examples/privacy/embedding_inversion/vec2text/requirements.txt b/examples/privacy/embedding_inversion/vec2text/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..367d4af6b83fd20fdd2fd4898618cc27de2d5432 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/requirements.txt @@ -0,0 +1,16 @@ +accelerate +bert_score +datasets +einops +evaluate +openai +optimum +pre-commit +pylcs +rouge_score +sacrebleu +sentence_transformers +tenacity +tokenizers +tqdm +transformers diff --git a/examples/privacy/embedding_inversion/vec2text/run.py b/examples/privacy/embedding_inversion/vec2text/run.py new file mode 100644 index 0000000000000000000000000000000000000000..8ab82e3bc7f64ae2c6daed33fbf36e674100f98a --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/run.py @@ -0,0 +1,37 @@ +''' +program start point +''' + +from datetime import datetime +from experiments import experiment_from_args +from run_args import parse_arguments, parse_args_into_dataclasses + +# #为了在gpu环境下使用mindspore2.4版本 +# os.environ['CUDA_HOME'] = '/luoyf' +# +# os.environ["WANDB_DISABLED"] = "true" +# os.environ['CUDA_VISIBLE_DEVICES'] = '1' +# #连接clash挂梯子必备 +# os.environ['http_proxy'] = 'http://127.0.0.1:7890' +# os.environ['https_proxy'] = 'http://127.0.0.1:7890' +# os.environ['all_proxy'] = 'socks5://127.0.0.1:7890' + + +def main(): + # ms.set_context(device_target="Ascend") + # ms.context.set_context(mode=ms.context.PYNATIVE_MODE) + # ms.context.set_context(mode=ms.context.GRAPH_MODE) + # model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14", from_tf=True) + args = parse_arguments() + #traing_args may not be a normal datacla + # ss, and then should be adapted to the new one. 
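+    # parse_args_into_dataclasses (run_args.py) plays the role of HfArgumentParser:
+    # it splits the flat argparse namespace into ModelArguments, DataArguments and
+    # TrainingArguments before they are handed to the experiment factory.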
+ model_args, data_args, training_args = parse_args_into_dataclasses(args) + experiment = experiment_from_args(model_args, data_args, training_args) + print("beginning time:") + print(datetime.now()) + experiment.run() + + + +if __name__ == "__main__": + main() diff --git a/examples/privacy/embedding_inversion/vec2text/run_args.py b/examples/privacy/embedding_inversion/vec2text/run_args.py new file mode 100644 index 0000000000000000000000000000000000000000..d14e854d6fba559d3003aad0612f4fc1d4801dd6 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/run_args.py @@ -0,0 +1,484 @@ +# pylint: disable=E1123 +''' +parse command to arguments +''' +import os +from dataclasses import dataclass, field +from typing import Optional, Tuple +import argparse + +import mindnlp.engine + + +DATASET_NAMES = [ + "nq", + "luar_reddit", + "msmarco", + "one_million_instructions", + "one_million_paired_instructions", +] + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + model_name_or_path: str = field( + ### + ## huggingface.co/facebook/dpr-ctx_encoder-single-nq-base + ### + default="t5-base", + metadata={ + "help": ( + "The model checkpoint for weights initialization .Don't set if you want to train a model from scratch." + ) + }, + ) + embedder_model_name: str = field( + ### + ## huggingface.co/facebook/dpr-ctx_encoder-single-nq-base + ### + default="gtr_base", + metadata={ + "help": "Model to get embeddings from (locally)", + # "choices": EMBEDDER_MODEL_NAMES, + }, + ) + embedder_model_api: Optional[str] = field( + default=None, metadata={"help": "API to get embeddings from"} + ) + embedder_torch_dtype: str = field( + default="float32", + metadata={ + "help": "torch dtype of embedder", + "choices": ["float32", "float16", "bfloat16"], + }, + ) + embedding_transform_strategy: str = field( + default="repeat", + metadata={ + "help": "Strategy for transforming from sentence embedding into sequence-level input for encoder-decoder", + # "choices": EMBEDDING_TRANSFORM_STRATEGIES, + }, + ) + encoder_dropout_disabled: bool = field( + default=False, metadata={"help": "Disable dropout on T5 encoder"} + ) + decoder_dropout_disabled: bool = field( + default=False, metadata={"help": "Disable dropout on T5 decoder"} + ) + + # model_type: Optional[str] = field( + # default=None, + # metadata={ + # "help": "If training from scratch, pass a model type from the list: " + # + ", ".join(MODEL_TYPES) + # }, + # ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + config_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained config name or path if not the same as model_name" + }, + ) + tokenizer_name: Optional[str] = field( + default=None, + metadata={ + "help": "Pretrained tokenizer name or path if not the same as model_name" + }, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Where do you want to store the pretrained models downloaded from huggingface.co" + }, + ) + model_revision: str = field( + default="main", + metadata={ + "help": "The specific model version to use (can be a branch name, tag name or commit id)." 
+ }, + ) + max_seq_length: int = field( + default=128, metadata={"help": "Maximum sequence length for tokenizer"} + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + num_repeat_tokens: int = field( + default=16, + metadata={ + "help": "Number of times to repeat embedding along T5 input sequence length." + }, + ) + embedding_zero_except_topk: Optional[int] = field( + default=None, + metadata={ + "help": "For inverting with logits, will set all numbers in embedding except the top-K to -30." + }, + ) + embedder_no_grad: bool = field( + default=True, metadata={"help": "Whether to disable grads for DPR"} + ) + use_lora: bool = field( + default=False, metadata={"help": "Whether to use LORA+int8 for fine-tuning"} + ) + embedder_fake_with_zeros: bool = field( + default=False, + metadata={ + "help": "Whether to pass all zeros as embedding (and not use DPR at all)" + }, + ) + + use_frozen_embeddings_as_input: bool = field( + default=False, + metadata={ + "help": "Whether to pass a 'frozen_embedding' column and train" + }, + ) + corrector_ignore_hypothesis_embedding: bool = field( + default=False, + metadata={ + "help": "If set, and training corrector encoder, will ignore the hypothesis embedding" + }, + ) + embeddings_from_layer_n: Optional[int] = field( + default=None, + metadata={ + "help": "If set, uses embeddings from layer n - for example set to 0 to use word embeddings" + }, + ) + freeze_strategy: str = field( + default="none", + metadata={ + "help": "which part of the model to freeze", + # "choices": FREEZE_STRATEGIES, + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and ( + self.config_name is not None or self.model_name_or_path is not None + ): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + # datamap: bool = field( + # default="True", + # ) + dataset_name: Optional[str] = field( + default="msmarco", + metadata={ + "choices": DATASET_NAMES, + "help": "The name of the dataset to use (via the datasets library).", + }, + ) + max_eval_samples: int = field( + default=1000, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + use_less_data: int = field( + default=-1, + metadata={ + "help": {"Use a small amount of the training/eval data (for testing)"} + }, + ) + + def __post_init__(self): + if self.dataset_name is None: + raise ValueError("Need a dataset name.") + + +@dataclass +class TrainingArguments(mindnlp.engine.TrainingArguments): + ''' + some training arguments + ''' + # https://github.com/huggingface/transformers/blob/e82c1cb78e178519060b9391214727be75a218ca/src/transformers/training_args.py#L121 + output_dir: Optional[str] = field( + default=None, + metadata={ + "help": "Output directory for training saves. If not set, will output to saves/." 
+ }, + ) + corrector_model_alias: Optional[str] = field( + default=None, + metadata={"help": "Alias of corrector model to train (defined in aliases.py)"}, + ) + corrector_model_from_pretrained: Optional[str] = field( + default=None, + metadata={ + "help": "Alias of pre-trained corrector model to train (defined in aliases.py)" + }, + ) + cheat_on_train_hypotheses: bool = field( + default=False, + metadata={ + "help": "When set, will interpolate true with pred train hypothesis for 'closer' training data" + }, + ) + + steps_per_epoch: int = field( + default=500_000, + metadata={"required": False, "help": "Size of pseudo-training set."}, + ) + num_train_epochs: float = field( + default=30.0, + metadata={"required": False, "help": "Number of epochs for training"}, + ) + learning_rate: float = field( + default=2e-5, + metadata={"help": "The initial learning rate for AdamW on the backbone model."}, + ) + use_wandb: Optional[bool] = field( + default=None, metadata={"help": "Whether or not to log to Weights & Biases."} + ) + report_to: str = "wandb" + per_device_train_batch_size: int = field( + default=128, metadata={"help": "Batch size per GPU/TPU core/CPU for training."} + ) + bf16: bool = field( + default=False, + metadata={"help": ("Whether to use bf16 (mixed) precision instead of 32-bit.")}, + ) + # torch_compile: bool = True # for torch 2 + + ##################### Experimental Settings #################### + experiment: str = field( + default="inversion", + metadata={ + "required": False, + "help": "Which experiment to run (defines model, loss func, dataset...) ", + "choices": [ + "inversion", # our model: projects and feeds to encoder-decoder + "inversion_from_logits", + "inversion_from_logits_emb", + "inversion_decoder_only", # baseline: use single embedding as input to a decoder + "inversion_bow", + "inversion_na", + "reranking", + "corrector", + "corrector_encoder", + ], + }, + ) + exp_name: str = field( + default="", + metadata={ + "required": False, + "help": "Name to identify this specific run of an experiment", + }, + ) + + exp_group_name: str = field( + default="", + metadata={ + "required": False, + "help": "Name to identify this sweep / series of experiments", + }, + ) + + # Need to *not* remove unused columns so we keep query_attention_mask, etc. + # which huggingface doesn't think we need. + remove_unused_columns: bool = False + + # Do evaluation and logging on certain num steps. + evaluation_strategy: str = "steps" + logging_strategy: str = "steps" + save_strategy: str = "steps" + + save_total_limit: int = 2 # Maximum number of checkpoints to save. + + warmup_steps: int = field( + default=4000, metadata={"help": "Number of steps of warmup"} + ) + logging_steps: int = field( + default=400, metadata={"help": "Number of steps between logging metrics"} + ) + save_steps: int = field( + default=4000, + metadata={"help": "Number of steps per save"}, + ) + eval_steps: int = field( + default=8, + metadata={ + "help": "Number of steps between eval (will be scaled as if batch size is 32)" + }, + ) + mock_embedder: bool = field( + default=False, + metadata={ + "help": ( + "If true, will delete the embedder and replace all embedder logits with" + " zeros once training starts. You probably don't want to do this. But " + " if you precomputed all the embeddings for train and val, this will" + " work fine, except the embedding-based metrics (just cosine similarity" + " I think) will be broken." 
+ ) + }, + ) + ddp_find_unused_parameters: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "When using distributed training, the value of the flag `find_unused_parameters` passed to " + "`DistributedDataParallel`." + ) + }, + ) + + include_inputs_for_metrics: bool = False + + def __post_init__(self): + super().__post_init__() + self._frozen = True + self.report_to = ( + ["wandb"] if (self.use_wandb and (self.local_rank <= 0)) else [] + ) + self.dataloader_pin_memory = True + # num_workers = torch.cuda.device_count() + num_workers = 1 + os.environ["RAYON_RS_NUM_CPUS"] = str( + num_workers + ) # Sets threads for hf tokenizers + self.dataloader_num_workers = num_workers + print(f"Set num workers to {num_workers}") + + self.dataloader_drop_last = False + + # Scale logging steps proportional to batch size. + self.warmup_steps = round(self.warmup_steps * (32 / self.train_batch_size)) + self.logging_steps = round(self.logging_steps * (32 / self.train_batch_size)) + self.eval_steps = round(self.eval_steps * (32 / self.train_batch_size)) + self.save_steps = round(self.save_steps * (32 / self.train_batch_size)) + + # defaults from SentenceTransformers + # lr 2e-5 + self.adam_epsilon = 1e-6 + + self.group_by_length = True + self.length_column_name = "length" + + self.load_best_model_at_end = True + self.greater_is_better = False + + self.do_eval = False + # self.ddp_backend = "gloo" + +def parse_arguments(): + ''' + get arguments from command line + ''' + parser = argparse.ArgumentParser() + parser.add_argument("--per_device_train_batch_size", type=int, required=True) + parser.add_argument("--per_device_eval_batch_size", type=int, required=True) + parser.add_argument("--max_seq_length", type=int, required=True) + parser.add_argument("--model_name_or_path", type=str, required=True) + parser.add_argument("--dataset_name", type=str, required=True) + parser.add_argument("--embedder_model_name", type=str, required=True) + parser.add_argument("--num_repeat_tokens", type=int, required=True) + parser.add_argument("--embedder_no_grad", type=bool, default=False, help="Whether to disable gradients") + parser.add_argument("--num_train_epochs", type=int, required=True) + parser.add_argument("--max_eval_samples", type=int, required=True) + parser.add_argument("--eval_steps", type=int, required=True) + parser.add_argument("--warmup_steps", type=int, required=True) + parser.add_argument("--bf16", type=int, choices=[0, 1], help="Use bf16 precision if set to 1") + parser.add_argument("--use_frozen_embeddings_as_input", type=bool, default=False) + parser.add_argument("--experiment", type=str, required=True) + # parser.add_argument("--lr_scheduler_type", type=str, required=True) + parser.add_argument("--learning_rate", type=float, required=True) + parser.add_argument("--output_dir", type=str, required=True) + parser.add_argument("--save_steps", type=int, required=True) + parser.add_argument("--use_less_data", type=int, required=True) + # parser.add_argument("--datamap", type=bool, required=False) + # corrector parameters + parser.add_argument("--corrector_model_alias", type=str, help="Optional alias for the corrector model") + + args = parser.parse_args() + + return args + + +def parse_args_into_dataclasses(args: argparse.Namespace) -> Tuple[ModelArguments, DataArguments, TrainingArguments]: + ''' + input args to dataclass aligning with HfArgParser in vec2text + ''' + + + if hasattr(args, 'corrector_model_alias') and args.corrector_model_alias: + print("diyici:") + print(args) + # pylint: 
disable=E1123 + training_args = TrainingArguments( + output_dir=args.output_dir, + num_train_epochs=args.num_train_epochs, + learning_rate=args.learning_rate, + per_device_train_batch_size=args.per_device_train_batch_size, + eval_steps=args.eval_steps, + warmup_steps=args.warmup_steps, + bf16=bool(args.bf16), + experiment=args.experiment, + save_steps=args.save_steps, + corrector_model_alias=args.corrector_model_alias, + disable_tqdm=False,# pylint: disable=E1123 + # 动态加入该参数 + ) + else: + # pylint: disable=E1123 + training_args = TrainingArguments( + output_dir=args.output_dir, + num_train_epochs=args.num_train_epochs, + learning_rate=args.learning_rate, + per_device_train_batch_size=args.per_device_train_batch_size, + eval_steps=args.eval_steps, + warmup_steps=args.warmup_steps, + bf16=bool(args.bf16), + experiment=args.experiment, + save_steps=args.save_steps, + disable_tqdm=False,# pylint: disable=E1123 + ) + model_args = ModelArguments( + model_name_or_path=args.model_name_or_path, + embedder_model_name=args.embedder_model_name, + num_repeat_tokens=args.num_repeat_tokens, + use_frozen_embeddings_as_input=args.use_frozen_embeddings_as_input, + embedder_no_grad=args.embedder_no_grad, + max_seq_length=args.max_seq_length, + ) + + data_args = DataArguments( + dataset_name=args.dataset_name, + max_eval_samples=args.max_eval_samples, + use_less_data=args.use_less_data, + # datamap=args.datamap, + ) + return model_args, data_args, training_args diff --git a/examples/privacy/embedding_inversion/vec2text/tokenize_data.py b/examples/privacy/embedding_inversion/vec2text/tokenize_data.py new file mode 100644 index 0000000000000000000000000000000000000000..10fd4e2f050189497ad955f00812b450cde8760a --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/tokenize_data.py @@ -0,0 +1,372 @@ +''' +tokenize data +''' + +from dataclasses import dataclass +from typing import Callable, Dict, Optional, Union, Any + +import numpy as np +import mindspore as ms +import mindspore.numpy as mnp +from mindnlp.transformers import PreTrainedTokenizerBase +from mindnlp.utils import PaddingStrategy +from mindnlp.transformers.tokenization_utils_fast import PreTrainedTokenizer +from mindnlp import transformers + +from models import InversionModel + + + +# pylint disable: C0330 +def tokenize_function(tokenizer: PreTrainedTokenizer, embedder_tokenizer: PreTrainedTokenizer, + max_seq_length: int, padding: bool = False,) -> Callable[[Dict], Dict]: + ''' + tokenize_function + ''' + def tokenize_function_inner(examples) -> Dict[str, ms.Tensor]: + try: + texts = examples + output = tokenizer( + texts, + padding=padding, + truncation=True, + max_length=max_seq_length, + return_tensors='np' + ) + + output['input_ids'] = output['input_ids'][0] + output['attention_mask'] = output['attention_mask'][0] + output_labels_list = [ + (-100 if token_id == tokenizer.pad_token_id else token_id) for token_id in output["input_ids"] + ] + output["labels"] = np.array(output_labels_list) + + # 计算有效长度并生成 length 数组 + count_of_ones = sum(output["attention_mask"]) + output["length"] = np.array([count_of_ones]) + # mask = output["input_ids"] == tokenizer.pad_token_id + # labels = mnp.where(mask, ms.tensor(-100), output["input_ids"]) + # output["labels"] = labels + + # count_of_ones = mnp.sum(output["attention_mask"]) + # output["length"] = count_of_ones + # print(count_of_ones.asnumpy().item()) + + + + embedder_output = embedder_tokenizer( + texts, + padding="max_length", + truncation=True, + max_length=max_seq_length, + return_tensors="np", 
+ ) + embedder_output['input_ids'] = embedder_output['input_ids'][0] + embedder_output['attention_mask'] = embedder_output['attention_mask'][0] + + + + # embedder_output = {f"embedder_{key}": value.asnumpy().tolist() for key, value in embedder_output.items()} + embedder_output = {f"embedder_{key}": value for key, value in embedder_output.items()} + # print("--------------------------------------------------------------------") + # print({**output, **embedder_output}) + return {**output, **embedder_output} + except Exception as e: + print(f"Error during processing: {e}") + raise # Re-throw the exception after logging + + return tokenize_function_inner + + +def tokenize_function_(tokenizer: PreTrainedTokenizer, embedder_tokenizer: PreTrainedTokenizer, + max_seq_length: int, padding: bool = False,) -> Callable[[Dict], Dict]: + '''tokenize_function''' + def tokenize_function_inner(examples) -> Dict[str, ms.Tensor]: + try: + texts = examples + + output = tokenizer( + texts, + padding=padding, + truncation=True, + max_length=max_seq_length, + return_tensors='ms' + ) + + # print("output的值是:") + # print(output) + # print("------------end---------------") + output['input_ids'] = output['input_ids'][0] + output['attention_mask'] = output['attention_mask'][0] + output_labels_list = [ + (-100 if token_id == tokenizer.pad_token_id else token_id) for token_id in output["input_ids"] + ] + output["labels"] = np.array(output_labels_list) + + # 计算有效长度并生成 length 数组 + # count_of_ones = sum(output["attention_mask"]) + # output["length"] = np.array([count_of_ones]) + mask = output["input_ids"] == tokenizer.pad_token_id + labels = mnp.where(mask, ms.tensor(-100), output["input_ids"]) + output["labels"] = labels + + count_of_ones = mnp.sum(output["attention_mask"]) + output["length"] = [count_of_ones.asnumpy().item()] + # print("++++++++++++++++++++++++++++++++++++++++") + # print(count_of_ones.asnumpy().item()) + # print("-----------------------------------------") + + + embedder_output = embedder_tokenizer( + texts, + padding="max_length", + truncation=True, + max_length=max_seq_length, + return_tensors="ms", + ) + embedder_output['input_ids'] = embedder_output['input_ids'][0] + embedder_output['attention_mask'] = embedder_output['attention_mask'][0] + + + + # embedder_output = {f"embedder_{key}": value.asnumpy().tolist() for key, value in embedder_output.items()} + embedder_output = {f"embedder_{key}": value for key, value in embedder_output.items()} + # print("--------------------------------------------------------------------") + # print({**output, **embedder_output}) + return {**output, **embedder_output} + except Exception as e: + print(f"Error during processing: {e}") + raise # Re-throw the exception after logging + + return tokenize_function_inner + + +def embed_dataset_batch(model: InversionModel, batch: Dict) -> Dict: + ''' + embed_dataset_batch + ''' + assert "input_ids" in batch.keys(), f"invalid keys {batch.keys()}" + assert hasattr(model, "call_embedding_model") + + input_ids = batch["input_ids"] + inputs_str = model.tokenizer.batch_decode(input_ids, skip_special_tokens=True) + emb_input_ids = model.embedder_tokenizer( + inputs_str, + max_length=model.config.max_seq_length, + truncation=True, + padding="max_length", + return_tensors="ms", + ) + + model.set_train(False) + batch["frozen_embeddings"] = model.call_embedding_model(**emb_input_ids) + model.set_train(True) + return batch + +# pylint: disable=W0613 +def get_tokenizer_mapping(lm: str, inverter: str, inverter_vocab_size: int) -> ms.Tensor: 
+ """Computes the mapping from token outputs in `lm`'s vocabulary to those in `inverter's + vocabulary. Makes some assumptions about spacing. + """ + lm_tokenizer = transformers.AutoTokenizer.from_pretrained(lm) + inverter_tokenizer = transformers.AutoTokenizer.from_pretrained(inverter) + + lm_vocab = lm_tokenizer.vocab + mapping = ms.ops.zeros(len(lm_vocab), dtype=ms.int64) + for k, idx in lm_tokenizer.vocab.items(): + # We replace space tokens with nothing and allow the call to + # inverter_tokenizer.decode to determine this. We also + # filter out 2 and 3 as first tokens which are extremely common + # when the T5 tokenizer processes unicode. (These are hacks + # specific to the LLAMA-T5 lm-inverter pairing, and it would + # be better to find an automated wa to do this later.) + mapping[idx] = inverter_tokenizer.encode(k.replace("▁", " "))[0] + if mapping[idx] in [2, 3]: + mapping[idx] = inverter_tokenizer.encode(k.replace("▁", " "))[1] + + preservation = len(set(mapping.tolist())) / len(lm_vocab) + print( + f"Mapped tokenizer {lm} to {inverter}. Preserved {preservation*100:.1f}% of unique tokens." + ) + return mapping + +def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs): + """ + Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer. + """ + + # To avoid errors when using Feature extractors + if not hasattr(tokenizer, "deprecation_warnings"): + return tokenizer.pad(*pad_args, **pad_kwargs) + + # Save the state of the warning, then disable it + warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False) + tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + + try: + padded = tokenizer.pad(*pad_args, **pad_kwargs) + finally: + # Restore the state of the warning. + tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state + + return padded + + +# convert to current assignment without too much change from transformer library of huggingface +# lizard: ignore=CYCLOMATIC_COMPLEXITY +@dataclass +class DataCollatorForSeq2Seq: + """ + Data collator that will dynamically pad the inputs received, as well as the labels. + + Args: + tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]): + The tokenizer used for encoding the data. + model ([`PreTrainedModel`], *optional*): + The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to + prepare the *decoder_input_ids* + + This is useful when using *label_smoothing* to avoid calculating loss twice. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + sequence is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. 
+ + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (`int`, *optional*, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions). + return_tensors (`str`, *optional*, defaults to `"pt"`): + The type of Tensor to return. Allowable values are "np", "pt" and "tf". + """ + + tokenizer: PreTrainedTokenizerBase + model: Optional[Any] = None + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + label_pad_token_id: int = -100 + return_tensors: str = "ms" + + def __call__(self, features, return_tensors=None): + '''call func, reconstruct to form a func to meet with CCN restriction''' + # 确定返回的 tensor 类型 + return_tensors = return_tensors or self.return_tensors + + # 获取 labels 键名 + label_name = self._get_label_name(features) + + # 提取 labels 和非 labels 特征 + labels, non_labels_features = self._extract_labels_and_features(features, label_name) + + # 使用 tokenizer 对非标签特征进行处理 + batch = self._process_features(non_labels_features, return_tensors) + + # 手动填充 labels + if labels is not None: + batch["labels"] = self._process_labels(labels, features, label_name) + + # 处理返回 tensor 类型 + batch = self._convert_labels_to_tensor(batch, return_tensors) + + # 准备 decoder_input_ids + if self._requires_decoder_input_ids(labels): + batch["decoder_input_ids"] = self.model.prepare_decoder_input_ids_from_labels(labels=batch["labels"]) + + return batch + + def _get_label_name(self, features): + """获取标签名称,如果有 'label' 则使用 'label' 否则使用 'labels'""" + return "label" if "label" in features[0].keys() else "labels" + + def _extract_labels_and_features(self, features, label_name): + """提取标签和非标签特征""" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + # 将 [None] 转换为 None + if labels and all(label is None for label in labels): + labels = None + non_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features] + return labels, non_labels_features + + def _process_features(self, non_labels_features, return_tensors): + """使用 tokenizer 处理特征""" + return pad_without_fast_tokenizer_warning( + self.tokenizer, + non_labels_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=return_tensors, + ) + + def _process_labels(self, labels, features, label_name): + """手动填充 labels""" + no_padding = self.padding is False or self.padding == PaddingStrategy.DO_NOT_PAD + if no_padding: + return self._handle_no_padding(labels, features, label_name) + + return self._handle_padding(labels) + + def _handle_no_padding(self, labels, features, label_name): + """处理没有填充的标签""" + if isinstance(features[0][label_name], list): + return list(labels) + + return [np.concatenate([label, []]) for label in labels] + + def _handle_padding(self, labels): + """处理需要填充的标签""" + max_label_length = self._get_max_label_length(labels) + return [ + self._pad_label(label, max_label_length) for label in labels + ] + + def _get_max_label_length(self, labels): + """获取最大标签长度""" + max_padding = self.padding == PaddingStrategy.MAX_LENGTH and self.max_length is not None + if max_padding: + return self.max_length + return max(len(l) for l in labels) + + def _pad_label(self, label, max_label_length): + """对标签进行填充""" + padding_side = self.tokenizer.padding_side + pad_length = max_label_length - len(label) + 
padding = [self.label_pad_token_id] * pad_length + + if padding_side == "right": + return label + padding + + return padding + label + + def _convert_labels_to_tensor(self, batch, return_tensors): + """根据指定的返回类型转换 labels 为 tensor""" + if batch.get("labels") is not None: + if return_tensors == "pt": + import torch + batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64) + elif return_tensors == "tf": + import tensorflow as tf + batch["labels"] = tf.constant(batch["labels"], dtype=tf.int64) + else: + batch["labels"] = ms.tensor(batch["labels"], dtype=ms.int64) + else: + batch["labels"] = None + return batch + + def _requires_decoder_input_ids(self, labels): + """检查是否需要生成 decoder_input_ids""" + return ( + labels is not None and + self.model is not None and + hasattr(self.model, "prepare_decoder_input_ids_from_labels") + ) diff --git a/examples/privacy/embedding_inversion/vec2text/trainers/__init__.py b/examples/privacy/embedding_inversion/vec2text/trainers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a804305cd09c8c7edb4babe1932c2c9806506b4c --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/trainers/__init__.py @@ -0,0 +1,6 @@ +''' +init +''' +# pylint: disable=E0001 +from .corrector import Corrector # noqa: F401 +from .inversion import InversionTrainer # noqa: F401 diff --git a/examples/privacy/embedding_inversion/vec2text/trainers/base.py b/examples/privacy/embedding_inversion/vec2text/trainers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..b740e6974ca2be52416a18fbe060f90fb4a31b52 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/trainers/base.py @@ -0,0 +1,489 @@ +''' +base trainer +''' +import collections +import copy +import logging +import random +from typing import Callable, Dict, List, Tuple, Union + +import evaluate +import nltk +import numpy as np +import scipy.stats +import tqdm +from mindnlp.engine import Trainer, EvalLoopOutput +import mindspore as ms +import mindspore.ops as ops + +logger = logging.getLogger(__name__) + +# pylint: disable=W0612 +DEFAULT_INPUT_STRING = ("Twas brillig, and the slithy toves, Did gyre and gimble in the wabe," + "All mimsy were the borogoves, And the mome raths outgrabe.") + +# pylint: disable=W0613 +def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(axis=-1) + + +def sem(l: List[float]) -> float: + result = scipy.stats.sem(np.array(l)) + if isinstance(result, np.ndarray): + return result.mean().item() + return result + + +def mean(l: Union[List[int], List[float]]) -> float: + return sum(l) / len(l) + + +def count_overlapping_ngrams(s1: str, s2: str, n: int) -> int: + ngrams_1 = nltk.ngrams(s1, n) + ngrams_2 = nltk.ngrams(s2, n) + ngram_counts_1 = collections.Counter(ngrams_1) + ngram_counts_2 = collections.Counter(ngrams_2) + total = 0 + for ngram, count in ngram_counts_1.items(): + total += min(count, ngram_counts_2[ngram]) + return total + + +class BaseTrainer(Trainer): + + '''BaseTrainer''' + + additional_metrics: List[Callable[..., Dict[str, float]]] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.preprocess_logits_for_metrics = preprocess_logits_for_metrics + self.compute_metrics = self.compute_metrics_func + self.metric_accuracy = evaluate.load("accuracy") + self.metric_bleu = 
evaluate.load("sacrebleu") + self.metric_rouge = evaluate.load("rouge") + self.additional_metrics = [] + + self.gen_kwargs = { + "early_stopping": False, + "num_beams": 1, + "do_sample": False, + "no_repeat_ngram_size": 0, + } + @property + def pad_token_id(self) -> int: + try: + return self.model.encoder_decoder.config.pad_token_id + except AttributeError: + return self.tokenizer.pad_token_id + + @property + def bos_token_id(self) -> int: + try: + return self.model.encoder_decoder.decoder_start_token_id + except AttributeError: + return self.tokenizer.bos_token_id + + def sanity_decode(self, input_string: str = None, max_length: int = 128): + """Encodes and decodes a string as a sanity check.""" + if input_string is None: + input_string = DEFAULT_INPUT_STRING + self.model.eval() + print("=" * 16, "Begin trainer sanity check", "=" * 16) + print("\tInput to encode ->", input_string) + inputs = self.embedder_tokenizer( + input_string, + return_tensors="ms", + max_length=max_length, + padding="max_length", + ) + inputs = inputs + gen_kwargs = copy.copy(self.gen_kwargs) + gen_kwargs["min_length"] = 1 + gen_kwargs["max_length"] = max_length + print("max_length:", gen_kwargs["max_length"]) + regenerated = self.generate( + inputs={ + "embedder_input_ids": inputs["input_ids"], + "embedder_attention_mask": inputs["attention_mask"], + }, + generation_kwargs=gen_kwargs, + ) + print("\tDecoded output shape -> ", regenerated.shape) + output_string = self.tokenizer.decode( + regenerated.flatten(), skip_special_tokens=True + ) + print("\tDecoded output ->", output_string) + print("=" * 16, "End trainer sanity check", "=" * 16) + + def _log_preds_table(self, table_key: str, decoded_preds: List[str], decoded_labels: List[str]): + ''' + _log_preds_table + ''' + if not self.args.use_wandb: + return + + if not self.args.local_rank <= 0: + return + + num_rows = 50 + idxs = random.choices( + range(len(decoded_preds)), k=min(len(decoded_preds), num_rows) + ) + + data = [] + for idx in idxs: + data.append([decoded_labels[idx], decoded_preds[idx]]) + + + def _get_decoded_sequences(self, dataset, n: int) -> Tuple[List[ms.Tensor], List[ms.Tensor]]: + """Iterates through eval dataset and does decoding. + + TODO: do this better. We shouldn't need to iterate through eval set twice + but I don't want to copy 1000 lines of code to change their eval loop... + + Probably want custom eval eventually. Also this depends on eval data being + in the same order which is annoying. 
+ """ + assert not self.model.training + + gen_kwargs = copy.copy(self.gen_kwargs) + all_preds = [] + all_labels = [] + for _, inputs in enumerate(tqdm.tqdm(dataset, desc="generating from val", leave=False)): + # https://huggingface.co/docs/transformers/v4.26.1/en/main_classes/text_generation#transformers.GenerationMixin.generate + # inputs_cuda = {k: v.to(self.args.device) for k, v in inputs.items()} + max_length = self.model.config.max_seq_length + gen_kwargs["max_length"] = max_length + self.model.set_train(False) + + inputs_one_col_dict = { + "input_ids": inputs[0], + "attention_mask": inputs[1], + "labels": inputs[2], + "length": inputs[3], + "embedder_input_ids": inputs[4], + "embedder_attention_mask": inputs[5] + + + } + generated_text = self.generate(inputs=inputs_one_col_dict, generation_kwargs=gen_kwargs) + self.model.set_train(True) + if generated_text.shape[1] < max_length: + # Pad generated text to max length + pad_tokens = ( + ops.ones( + (generated_text.shape[0], max_length - generated_text.shape[1]), + dtype=ms.int64 + ) + * self.pad_token_id + ) + generated_text = ops.cat((generated_text, pad_tokens), axis=1) + + # true_input_ids = inputs["input_ids"] + true_input_ids = inputs[0] + if true_input_ids.shape[1] < max_length: + # Pad true text to max length + # Pad generated text to max length + pad_tokens = ( + ops.ones( + (true_input_ids.shape[0], max_length - true_input_ids.shape[1]), + dtype=ms.int64 + ) + * self.pad_token_id + ) + true_input_ids = ops.cat((true_input_ids, pad_tokens), axis=1) + + all_preds.extend(generated_text.asnumpy().tolist()) + all_labels.extend(true_input_ids.asnumpy().tolist()) + if len(all_preds) >= n: + break + return all_preds, all_labels + + def _compute_data_metrics(self, inputs: Dict[str, ms.Tensor]) -> Dict[str, float]: + '''compute_data_metrics''' + inputs_pad_tokens = ( + (inputs["input_ids"] == self.tokenizer.pad_token_id) + .sum(axis=1) + .float() + .mean() + .item() + ) + embedder_inputs_pad_tokens = ( + (inputs["embedder_input_ids"] == self.embedder_tokenizer.pad_token_id) + .sum(axis=1) + .float() + .mean() + .item() + ) + + inputs_non_pad_tokens = inputs["input_ids"].shape[1] - inputs_pad_tokens + embedder_inputs_non_pad_tokens = ( + inputs["input_ids"].shape[1] - embedder_inputs_pad_tokens + ) + + return { + "encoder_decoder_inputs_pad_tokens": inputs_pad_tokens, + "encoder_decoder_inputs_non_pad_tokens": inputs_non_pad_tokens, + "embedder_inputs_pad_tokens": embedder_inputs_pad_tokens, + "embedder_inputs_non_pad_tokens": embedder_inputs_non_pad_tokens, + } + + def compute_metrics_func(self, eval_preds): + ''' + compute_metrics_func + ''' + preds = eval_preds.predictions + labels = eval_preds.label_ids + + assert labels, "got empty labels for eval" + assert ( + ms.tensor(preds).shape == ms.tensor(labels).shape + ), f"preds.shape {preds.shape} / labels.shape {labels.shape}" + + # preds have the same shape as the labels. + labels = labels.reshape(-1) + preds = preds.reshape(-1) + accuracy_result = self.metric_accuracy.compute( + predictions=preds, references=labels + ) + + return {**accuracy_result} + + def _text_comparison_metrics(self, predictions_ids, predictions_str, references_ids, references_str): + '''text_comparison_metrics''' + assert len(predictions_ids) == len(references_ids) + assert len(predictions_ids) == len(predictions_str) + assert len(predictions_str) == len(references_str) + num_preds = len(predictions_ids) + if not num_preds: + return {} + + + + # Compute token, precision, recall, and ngram-level metrics. 
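+        # Worked example for the set-level ratios computed below (illustrative):
+        # with unique-token sets S_true = {a, b, c} and S_pred = {a, b, d},
+        # tp = |S_true & S_pred| = 2, so the two ratios are 2 / |S_true| = 2/3 and
+        # 2 / |S_pred| = 2/3, and the resulting token-set F1 is also 2/3.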
+ precision_sum = 0.0 + recall_sum = 0.0 + num_overlapping_words = [] + num_overlapping_bigrams = [] + num_overlapping_trigrams = [] + num_true_words = [] + num_pred_words = [] + f1s = [] + for i in range(num_preds): + true_words = nltk.tokenize.word_tokenize(references_str[i]) + pred_words = nltk.tokenize.word_tokenize(predictions_str[i]) + num_true_words.append(len(true_words)) + num_pred_words.append(len(pred_words)) + + true_words_set = set(true_words) + pred_words_set = set(pred_words) + tp = len(true_words_set & pred_words_set) + fp = len(true_words_set) - len(true_words_set & pred_words_set) + fn = len(pred_words_set) - len(true_words_set & pred_words_set) + + precision = (tp) / (tp + fp + 1e-20) + recall = (tp) / (tp + fn + 1e-20) + + try: + f1 = (2 * precision * recall) / (precision + recall + 1e-20) + except ZeroDivisionError: + f1 = 0.0 + f1s.append(f1) + + precision_sum += precision + recall_sum += recall + + ############################################################ + num_overlapping_words.append( + count_overlapping_ngrams(true_words, pred_words, 1) + ) + num_overlapping_bigrams.append( + count_overlapping_ngrams(true_words, pred_words, 2) + ) + num_overlapping_trigrams.append( + count_overlapping_ngrams(true_words, pred_words, 3) + ) + + set_token_metrics = { + "token_set_precision": (precision_sum / num_preds), + "token_set_recall": (recall_sum / num_preds), + "token_set_f1": mean(f1s), + # "token_set_f1_sem": sem(f1s), + # "n_ngrams_match_1": mean(num_overlapping_words), + # "n_ngrams_match_2": mean(num_overlapping_bigrams), + # "n_ngrams_match_3": mean(num_overlapping_trigrams), + # "num_true_words": mean(num_true_words), + # "num_pred_words": mean(num_pred_words), + } + ############################################################ + bleu_results = np.array( + [ + self.metric_bleu.compute(predictions=[p], references=[r])["score"] + for p, r in zip(predictions_str, references_str) + ] + ) + #rouge_result = self.metric_rouge.compute( + #predictions=predictions_str, references=references_str + #) + self.bleu_results = ( + bleu_results.tolist() + ) # store bleu results in case we want to use them later for t-tests + # bertscore_result = self.metric_bertscore.compute( + # predictions=predictions_str, references=references_str, lang="en" + # ) + exact_matches = np.array(predictions_str) == np.array(references_str) + gen_metrics = { + "bleu_score": bleu_results.mean(), + # "bleu_score_sem": sem(bleu_results), + # "rouge_score": rouge_result[ + # "rouge1" + # ], # ['rouge1', 'rouge2', 'rougeL', 'rougeLsum'] + # "bert_score": statistics.fmean(bertscore_result["f1"]), + "exact_match": mean(exact_matches), + # "exact_match_sem": sem(exact_matches), + } + + all_metrics = {**set_token_metrics, **gen_metrics} + for metric in self.additional_metrics: + all_metrics.update(metric(references_str, predictions_str)) + + return all_metrics + # pylint: disable=R0915 + # pylint: disable=W0212 + def eval_generation_metrics(self, dataset) -> Dict[str, float]: + ''' + eval_generation_metrics + ''' + # Get decoded text. Note that this is different than `preds`, which + # is used to compute the loss. 
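+        # Rough flow of this method: (1) greedy-decode a sample of eval examples,
+        # (2) compare decoded strings to the references (token-set stats, BLEU,
+        # exact match), (3) re-embed predictions and references with the embedder
+        # and report their cosine similarity as emb_cos_sim.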
+ preds_sample_list, preds_sample_labels_list = self._get_decoded_sequences( + dataset, n=10000 + ) + decoded_preds = self.tokenizer.batch_decode( + preds_sample_list, skip_special_tokens=True + ) + decoded_labels = self.tokenizer.batch_decode( + preds_sample_labels_list, skip_special_tokens=True + ) + bleu_result = self._text_comparison_metrics( + predictions_ids=preds_sample_list, + predictions_str=decoded_preds, + references_ids=preds_sample_labels_list, + references_str=decoded_labels, + ) + #pylint: disable=W0613 + self._log_preds_table( + table_key="val_text_preds", + decoded_preds=decoded_preds, + decoded_labels=decoded_labels, + ) + + if not decoded_preds: + return {} + print("[pred]", decoded_preds[3]) + print("[true]", decoded_labels[3]) + print("\n\n") + print("[pred]", decoded_preds[1]) + print("[true]", decoded_labels[1]) + print("\n\n") + print("[pred]", decoded_preds[2]) + print("[true]", decoded_labels[2]) + print("\n\n") + + # Compute sims of eval data using embedder. + preds_sample = ms.tensor(preds_sample_list)[:128] + preds_sample_labels = ms.tensor( + preds_sample_labels_list + )[:128] + + # Log num tokens. + num_tokens_metrics = { + "pred_num_tokens": ( + (preds_sample != self.pad_token_id) + & (preds_sample != self.bos_token_id)).sum(1).float().mean().item(), + "true_num_tokens": ( + (preds_sample_labels != self.pad_token_id) + & (preds_sample_labels != self.bos_token_id) + ).sum(1).float().mean().item(),} + + eos_token_id = self.embedder_tokenizer.eos_token_id + if eos_token_id is not None: + eos_tokens = ( + ops.ones( + (len(preds_sample), 1), + dtype=ms.int64 + ) + * eos_token_id + ) + preds_sample = ops.cat((preds_sample[:, 1:], eos_tokens), axis=1) + + try: + self.model.set_train(False) + # self.inversion_trainer.model.noise_level = 0.0 + preds_sample_retokenized = self.embedder_tokenizer( + decoded_preds, + padding=True, + truncation=False, + return_tensors="ms", + )["input_ids"] + preds_sample_retokenized = preds_sample_retokenized[ + : self.args.per_device_eval_batch_size, : + ] + pad_token_id = self.pad_token_id + preds_emb = self.call_embedding_model( + input_ids=preds_sample_retokenized, + attention_mask=(preds_sample_retokenized != pad_token_id), + ) + preds_sample_labels_retokenized = self.embedder_tokenizer( + decoded_labels, padding=True, truncation=False, return_tensors="ms" + )["input_ids"] + preds_sample_labels_retokenized = preds_sample_labels_retokenized[ + : self.args.per_device_eval_batch_size, : + ] + labels_emb = self.call_embedding_model( + input_ids=preds_sample_labels_retokenized, + attention_mask=(preds_sample_labels_retokenized != pad_token_id), + ) + emb_cos_sims = ops.cosine_similarity(preds_emb, labels_emb) + + sim_result = { + "emb_cos_sim": emb_cos_sims.mean().item(), + } + self.model.set_train(True) + + except (TypeError, RuntimeError): + sim_result = {"emb_cos_sim": 0, "emb_cos_sim_sem": 0} + + self.preds_sample_list = preds_sample_list + self.preds_sample_labels_list = preds_sample_labels_list + + metrics = {**num_tokens_metrics, **bleu_result, **sim_result} + return metrics + + def evaluation_loop(self, dataset, *args, **kwargs) -> EvalLoopOutput: + + """ + Run evaluation and returns metrics. + + Override to compute ppl from eval loss. + """ + + output = super().evaluation_loop(dataset, *args, **kwargs) + # metric_key_prefix = kwargs["metric_key_prefix"] + # # TODO compute some data metrics here too. + if self.args.local_rank <= 0: + # Generate some text on worker 0 and compute metrics. 
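+            # Only the main process (local_rank <= 0) runs the slow generation-based
+            # metrics; the resulting dict is merged into the standard eval metrics.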
+ generation_metrics = self.eval_generation_metrics(dataset) + output.metrics.update(generation_metrics) + return output + + #TODO: lack load checkpoint func + + def _remap_state_dict(self, state_dict: Dict) -> Dict: + """Edit keys posthumously on model load.""" + return state_dict diff --git a/examples/privacy/embedding_inversion/vec2text/trainers/corrector.py b/examples/privacy/embedding_inversion/vec2text/trainers/corrector.py new file mode 100644 index 0000000000000000000000000000000000000000..c15816132edb548aa94616deaa9401781835c49d --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/trainers/corrector.py @@ -0,0 +1,761 @@ +''' +utilize inversion model to iterablely correct result to get better result +''' +import functools +import logging +import os +from typing import Any, Dict, List, Optional, Tuple, Union + +import datasets +import mindspore as ms +import mindspore.ops as ops +from mindnlp.engine import EvalLoopOutput + +from models import CorrectorEncoderModel +from models.model_utils import freeze_params +from run_args import TrainingArguments +from utils import dataset_map_single_worker +from trainers.base import BaseTrainer +from trainers.inversion import InversionTrainer + +# pylint: disable=unused-variable +# pylint: disable=unused-argument + +logger = logging.getLogger(__name__) +class Corrector(BaseTrainer): + """Trains an encoder model to generate embeddings that recursively correct of an + InversionTrainer. + """ + + train_dataset: datasets.Dataset + eval_dataset: Dict[str, datasets.Dataset] + # TODO: don't assume that the encoder has to have the same tokenizer as the encoder_decoder + # or embedder model. + + _hypothesis_cache: Dict[str, Tuple[ms.Tensor, ms.Tensor, ms.Tensor]] + + # If set, only take hypothesis if it improves our distance to ground-truth. + return_best_hypothesis: bool = False + + # Initialize from this hypothesis, if set + initial_hypothesis_str: Optional[str] = None + + def __init__(self, + model: CorrectorEncoderModel, + inversion_trainer: InversionTrainer, + args: Optional[TrainingArguments], + **kwargs): + # Freeze other model params + freeze_params(inversion_trainer.model) + # We're training this corrector model to correct outputs from + # a model trained & loaded via the inversion trainer. + self.inversion_trainer = inversion_trainer + self.inversion_trainer.model.use_frozen_embeddings_as_input = True + super().__init__( + model=model, + args=args, + train_dataset=self.inversion_trainer.train_dataset, + eval_dataset=self.inversion_trainer.eval_dataset, + **kwargs, + ) + self.tokenizer = self.inversion_trainer.model.tokenizer + self.embedder_tokenizer = self.inversion_trainer.model.embedder_tokenizer + self.embedder = self.inversion_trainer.embedder + self.call_embedding_model = self.inversion_trainer.model.call_embedding_model + # self.train_dataset = self.inversion_trainer.train_dataset, + # self.eval_dataset = self.inversion_trainer.eval_dataset, + self.initial_hypothesis_str = None + + # Number of steps of self-correction + self.num_gen_recursive_steps = 1 + self.sequence_beam_width = 1 + + # If set, return closest (in embedding space) hypothesis we see during generation + self.return_best_hypothesis = False + + # Need to train with same device as the inversion model to avoid weird errors. 
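+        # Likewise, the fp16/bf16 settings must match those used by the inversion trainer;
+        # this is checked explicitly below.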
+ assert self.args.fp16 == self.inversion_trainer.args.fp16 + assert self.args.bf16 == self.inversion_trainer.args.bf16 + + # pylint: disable=W0221 + def evaluation_loop(self, dataloader, *args, **kwargs) -> EvalLoopOutput: + """ + Run evaluation and returns metrics. + + Override to compute ppl from eval loss. + """ + # self.inversion_trainer.model + metric_key_prefix = kwargs["metric_key_prefix"] + output = super().evaluation_loop(dataloader, *args, **kwargs) # type: ignore + if metric_key_prefix in {"eval_msmarco", "eval_nq"}: + n_rounds = 5 + self.num_gen_recursive_steps = n_rounds + multi_round_generation_metrics = self.eval_generation_metrics( + self.inversion_trainer.eval_dataset + ) + multiround_generation_metrics = { + f"{metric_key_prefix}_{n_rounds}round_{k}": v + for k, v in multi_round_generation_metrics.items() + } + output.metrics.update(multiround_generation_metrics) + self.num_gen_recursive_steps = 1 + + # self.inversion_trainer.model.cpu() #error!!! + + return output + + def _precompute_hypothesis_and_embedding(self, ds_inputs: Dict[str, ms.Tensor], collator=None,): + '''precompute_hypothesis_and_embedding''' + assert not self.model.training + inputs = collator.tokenizer.pad( + {k: v for k, v in ds_inputs.items() if k != "labels"}, + padding=collator.padding, + max_length=collator.max_length, + pad_to_multiple_of=collator.pad_to_multiple_of, + return_tensors=collator.return_tensors, + ) + + ( + frozen_embeddings, + hypothesis_input_ids, + hypothesis_attention_mask, + hypothesis_embedding, + ) = self._get_hypothesis_uncached(inputs=inputs) + ds_inputs["frozen_embeddings"] = frozen_embeddings.cpu() + ds_inputs["hypothesis_embedding"] = hypothesis_embedding.cpu() + + # cut padding so we can batch by length later + ds_inputs["hypothesis_input_ids"] = [] + ds_inputs["hypothesis_attention_mask"] = [] + #.cpu() is pytorch function, prepare to change in the corrector phase. + for input_ids, attention_mask in zip(hypothesis_input_ids.cpu(), hypothesis_attention_mask.cpu()): + num_tokens = attention_mask.sum() + ds_inputs["hypothesis_input_ids"].append(input_ids[: num_tokens + 1]) + ds_inputs["hypothesis_attention_mask"].append( + attention_mask[: num_tokens + 1] + ) + print("input_ids[0]:", self.tokenizer.decode(ds_inputs["input_ids"][0])) + print( + "hypothesis_input_ids[0]:", + self.tokenizer.decode(ds_inputs["hypothesis_input_ids"][0]), + ) + return ds_inputs + + def _preprocess_dataset_hypotheses(self, dataset: datasets.Dataset, filter_correct_examples: bool = False): + + ''' + In each model directory, we store a copy of the dataset with hypotheses + generated by the model that's checkpointed in this directory. This + won't scale well, but hopefully we don't do this with too many models, + and precomputing 5M hypotheses on A100 takes ~8 hours, so they're worth + storing. + + Note that the dataset fingerprint changes with calls to select() + so we won't overwrite the big dataset files when we use tiny subsets + during testing. 
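+        The cache directory is read from the VEC2TEXT_CACHE environment variable,
+        falling back to ~/.cache/inversion when it is unset, e.g.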
+ cache_dir = os.environ["VEC2TEXT_CACHE"] + ''' + cache_dir = os.environ.get( + "VEC2TEXT_CACHE", os.path.expanduser("~/.cache/inversion") + ) + assert os.path.exists(cache_dir) + + # pylint: disable=W0212 + cache_path = os.path.join(cache_dir, f"{dataset._fingerprint}_hypotheses.cache") + + if not os.path.exists(cache_path): + print(f"\t[{dataset.builder_name}] Saving hypotheses to path {cache_path}") + + dataset = dataset_map_single_worker( + dataset=dataset, + map_fn=functools.partial( + self._precompute_hypothesis_and_embedding, + collator=self.data_collator, + ), + batched=True, + batch_size=(self.args.train_batch_size * 2), + desc="Precomputing hypotheses for data", + num_proc=None + + ) + + if filter_correct_examples: + old_length = len(dataset) + + def embedding_is_not_correct(ex): + return ( + ~ops.isclose( + ex["frozen_embeddings"], + ex["hypothesis_embedding"], + ).all(axis=1) + ).tolist() + + dataset = dataset.filter( + embedding_is_not_correct, + batched=True, + batch_size=1024, + ) + print(f"filtered {old_length} datapoints to {len(dataset)}") + dataset.save_to_disk(cache_path) + else: + logging.info("Loading hypotheses from path %s", cache_path) + print( + f"\t[{dataset.builder_name}] Loading hypotheses from path {cache_path}" + ) + dataset = datasets.load_from_disk(cache_path) + return dataset, cache_path + + def precompute_hypotheses(self) -> None: + """Generates and embeds hypotheses using `self.inversion_trainer`. + + Returns path to precomputed-and-saved train dataset, which is sometimes + useful for outside processes. + """ + logger.info("Precomputing frozen embedding & hypotheses before training") + + self.train_dataset, _ = self._preprocess_dataset_hypotheses( + dataset=self.train_dataset, filter_correct_examples=True + ) + for k, v in self.eval_dataset.items(): + self.eval_dataset[k], _ = self._preprocess_dataset_hypotheses( + dataset=v, filter_correct_examples=False + ) + + def _inner_training_loop(self, *args, **kwargs): + '''inner training loop''' + + # Don't let tokenizers run in parallel mode. + # os.environ["TOKENIZERS_PARALLELISM"] = "False" + + self.model.eval() + # self.model.to(self.args.device) + #self.inversion_trainer.model + #self.precompute_hypotheses() + self.model.train() + # self.inversion_trainer.model.cpu() + return super()._inner_training_loop(*args, **kwargs) + + def generate(self, inputs: Dict, generation_kwargs: Dict, num_recursive_steps: int = None, + sequence_beam_width: int = None,) -> ms.Tensor: + """Generates text using self-correction. + + Args: + inputs (Dict[str, ms.Tensor]): inputs for generation, like the input embedding, hypothesis, + and hypothesis embedding + generation_kwargs (Dict): dictionary of parameters for generation, will be passed on to the model + sequence_beam_width (int): beam width for sequence-level beam search + Returns: + generated_ids (ms.Tensor): ids of generated text + """ + + try: + frozen_embeddings = inputs["frozen_embeddings"] + hypothesis_input_ids = inputs["hypothesis_input_ids"] + hypothesis_attention_mask = inputs["hypothesis_attention_mask"] + hypothesis_embedding = inputs["hypothesis_embedding"] + except KeyError: + ( + frozen_embeddings, + hypothesis_input_ids, + hypothesis_attention_mask, + hypothesis_embedding, + ) = self._get_hypothesis_uncached(inputs=inputs) + + # Add beam dimension: + # (batch, ...) -> (batch, beam, ...) 
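+        #
+        # Each pass through the recursive loop below (1) generates a new hypothesis
+        # conditioned on the target embedding, the current hypothesis text and its
+        # embedding, (2) re-embeds the new hypothesis with the frozen embedder, and
+        # (3) feeds it back in as the next hypothesis. With sequence_beam_width > 1,
+        # candidates are expanded to (batch * beam) between steps and the best-scoring
+        # ones are kept.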
+ inputs["frozen_embeddings"] = frozen_embeddings + inputs["hypothesis_input_ids"] = hypothesis_input_ids + inputs["hypothesis_attention_mask"] = hypothesis_attention_mask + inputs["hypothesis_embedding"] = hypothesis_embedding + # print("generating with sequence_beam_width:", (sequence_beam_width or self.sequence_beam_width)) + + num_recursive_steps = num_recursive_steps or self.num_gen_recursive_steps + sequence_beam_width = sequence_beam_width or self.sequence_beam_width + num_recursive_steps_so_far = 0 + + total_best_scores_seen = None # Track best scores for early stopping + + while num_recursive_steps >= 1: + gen_text_ids, hypothesis_embedding, best_scores = self._generate_with_beam( + inputs=inputs, + generation_kwargs=generation_kwargs, + num_recursive_steps=num_recursive_steps, + num_recursive_steps_so_far=num_recursive_steps_so_far, + sequence_beam_width=sequence_beam_width, + ) + inputs["hypothesis_input_ids"] = gen_text_ids + inputs["hypothesis_attention_mask"] = ( + gen_text_ids != self.model.encoder_decoder.config.pad_token_id + ).int() + inputs["hypothesis_embedding"] = hypothesis_embedding + # step counters + num_recursive_steps -= 1 + num_recursive_steps_so_far += 1 + # early stopping + if best_scores is not None: + if (total_best_scores_seen is not None) and ops.isclose(best_scores, total_best_scores_seen, atol=1e-3): + print( + "scores stopped increasing! stopping early after", + num_recursive_steps_so_far, + "steps", + ) + break + best_scores = total_best_scores_seen + + return gen_text_ids + + def generate_with_hypotheses(self, inputs: Dict, generation_kwargs: Dict, num_recursive_steps: int = None, + sequence_beam_width: int = None,) -> Tuple[ms.Tensor, ms.Tensor]: + """Generates text using self-correction. Works exactly like generate(), but returns all the intermediate hypotheses steps. + + Args: + inputs (Dict[str, ms.Tensor]): inputs for generation, like the input embedding, hypothesis, + and hypothesis embedding + generation_kwargs (Dict): dictionary of parameters for generation, will be passed on to the model + sequence_beam_width (int): beam width for sequence-level beam search + Returns: + generated_ids (List[ms.Tensor]): ids of generated text, for each hypothesis sequence + hypothesis_embeddings (List[ms.Tensor]): embeddings of each hypothesis sequence + """ + try: + frozen_embeddings = inputs["frozen_embeddings"] + hypothesis_input_ids = inputs["hypothesis_input_ids"] + hypothesis_attention_mask = inputs["hypothesis_attention_mask"] + hypothesis_embedding = inputs["hypothesis_embedding"] + except KeyError: + ( + frozen_embeddings, + hypothesis_input_ids, + hypothesis_attention_mask, + hypothesis_embedding, + ) = self._get_hypothesis_uncached(inputs=inputs) + + # Add beam dimension: + # (batch, ...) -> (batch, beam, ...) 
+ inputs["frozen_embeddings"] = frozen_embeddings + inputs["hypothesis_input_ids"] = hypothesis_input_ids + inputs["hypothesis_attention_mask"] = hypothesis_attention_mask + inputs["hypothesis_embedding"] = hypothesis_embedding + + num_recursive_steps = num_recursive_steps or self.num_gen_recursive_steps + sequence_beam_width = sequence_beam_width or self.sequence_beam_width + num_recursive_steps_so_far = 0 + + total_best_scores_seen = None # Track best scores for early stopping + + ground_truth_embedding = inputs["hypothesis_embedding"] + hypothesis_embeddings = [ground_truth_embedding] # Track hypothesis embeddings + + hypothesis_ids = [inputs["hypothesis_input_ids"]] # Track hypothesis ids + + while num_recursive_steps >= 1: + gen_text_ids, hypothesis_embedding, best_scores = self._generate_with_beam( + inputs=inputs, + generation_kwargs=generation_kwargs, + num_recursive_steps=num_recursive_steps, + num_recursive_steps_so_far=num_recursive_steps_so_far, + sequence_beam_width=sequence_beam_width, + ) + inputs["hypothesis_input_ids"] = gen_text_ids + inputs["hypothesis_attention_mask"] = ( + gen_text_ids != self.model.encoder_decoder.config.pad_token_id + ).int() + inputs["hypothesis_embedding"] = hypothesis_embedding + # step counters + num_recursive_steps -= 1 + num_recursive_steps_so_far += 1 + # early stopping + + if best_scores is not None: + closest_idx = ops.argmax(best_scores) + if (total_best_scores_seen is not None) and ops.isclose(best_scores, total_best_scores_seen, atol=1e-3): + print( + "scores stopped increasing! stopping early after", + num_recursive_steps_so_far, + "steps", + ) + break + best_scores = total_best_scores_seen + else: + closest_idx = 0 + + hypothesis_embeddings.append(hypothesis_embedding[closest_idx].unsqueeze(0)) + hypothesis_ids.append(gen_text_ids[closest_idx].unsqueeze(0)) + + return hypothesis_ids, hypothesis_embeddings + + + def _generate_with_beam(self, inputs, generation_kwargs, + num_recursive_steps, num_recursive_steps_so_far, sequence_beam_width): + ''' + _generate_with_beam是原来的NLOC==190的函数,拆分成以下多个函数 + 注释为no test for corrector的所有的函数就是为了实现这个模块 + ''' + assert num_recursive_steps >= 1 + frozen_embeddings = inputs["frozen_embeddings"] + + # 准备生成参数 + self._prepare_generation_kwargs(generation_kwargs, sequence_beam_width) + + # 生成初始假设文本 + if num_recursive_steps_so_far == 0 and self.initial_hypothesis_str: + gen_text_ids = self._generate_initial_hypothesis(inputs, frozen_embeddings) + else: + # 调用模型生成文本 + gen_text_ids, transition_scores = self._generate_text(inputs, generation_kwargs) + + # 嵌入生成的假设文本 + hypothesis_embedding = self.embed_generated_hypothesis(input_ids=gen_text_ids) + + # 获取批次大小 + batch_size = self._get_batch_size(frozen_embeddings, sequence_beam_width, num_recursive_steps_so_far) + + # 执行 Beam Search + best_scores = None + if gen_text_ids.shape[0] > batch_size: + gen_text_ids, hypothesis_embedding, best_scores = self._perform_beam_search( + inputs, + gen_text_ids, + hypothesis_embedding, + batch_size, + sequence_beam_width, + num_recursive_steps, + transition_scores + ) + + + # 确保嵌入的维度与冻结嵌入一致 + assert hypothesis_embedding.shape[-1] == inputs["frozen_embeddings"].shape[-1] + return gen_text_ids, hypothesis_embedding, best_scores + + def _prepare_generation_kwargs(self, generation_kwargs, sequence_beam_width): + '''no test for corrector''' + if not generation_kwargs["do_sample"]: + num_return_sequences = max(sequence_beam_width, generation_kwargs.get("num_beams", 1)) + generation_kwargs["num_beams"] = num_return_sequences + 
generation_kwargs["num_return_sequences"] = num_return_sequences + + def _generate_initial_hypothesis(self, inputs, frozen_embeddings): + '''no test for corrector''' + batch_size = frozen_embeddings.shape[0] + gen_text_ids = self.embedder_tokenizer( + [self.initial_hypothesis_str], + return_tensors="ms", + max_length=inputs["hypothesis_input_ids"].shape[1], + truncation=True, + padding="max_length", + )["input_ids"].repeat((batch_size, 1)) + + bos_token_id = self.model.encoder_decoder.config.decoder_start_token_id + bos_token_ids = ms.ops.ones((batch_size, 1), dtype=ms.int64) * bos_token_id + return ms.ops.cat((bos_token_ids, gen_text_ids[:, :-1]), axis=1) + + def _generate_text(self, inputs, generation_kwargs): + '''no test for corrector''' + outputs = self.model.generate( + inputs=inputs, + generation_kwargs=generation_kwargs, + return_dict_in_generate=True, + ) + gen_text_ids = outputs.sequences + transition_scores = self.model.encoder_decoder.compute_transition_scores( + outputs.sequences, + outputs.scores, + normalize_logits=True + ) + return gen_text_ids, transition_scores + + def _get_batch_size(self, frozen_embeddings, sequence_beam_width, num_recursive_steps_so_far): + '''no test for corrector''' + if num_recursive_steps_so_far == 0: + return frozen_embeddings.shape[0] + return int(frozen_embeddings.shape[0] / sequence_beam_width) + def _perform_beam_search(self, inputs, gen_text_ids, hypothesis_embedding, + batch_size, sequence_beam_width, num_recursive_steps, transition_scores): + '''no test for corrector''' + if sequence_beam_width == 1: + gen_text_ids, hypothesis_embedding = self._beam_search_regular( + gen_text_ids, hypothesis_embedding, inputs, batch_size, transition_scores + ) + elif num_recursive_steps == 1: + gen_text_ids, hypothesis_embedding = self._beam_search_base_case( + gen_text_ids, hypothesis_embedding, inputs, batch_size, transition_scores + ) + else: + gen_text_ids, hypothesis_embedding = self._beam_search_top_k( + gen_text_ids, hypothesis_embedding, + inputs, + batch_size, sequence_beam_width, + num_recursive_steps, + transition_scores + ) + + return gen_text_ids, hypothesis_embedding, transition_scores.max(1).values.cpu() + + def _beam_search_regular(self, gen_text_ids, + hypothesis_embedding, inputs, batch_size, transition_scores): + '''no test for corrector''' + beam_width = int(gen_text_ids.shape[0] / batch_size) + distances_per_beam = ms.ops.CosineSimilarity(dim=2)( + hypothesis_embedding.reshape((batch_size, beam_width, -1)), + inputs["frozen_embeddings"][:, None, :] + ) + + scores = transition_scores.reshape((batch_size, beam_width)) + best_idx_in_beam = ms.ops.Argmax()(scores, axis=1) + + #hypothesis_embedding = hypothesis_embedding.reshape((batch_size, beam_width, -1))[ms.ops.arange(batch_size), best_idx_in_beam] + reshaped_embedding = hypothesis_embedding.reshape((batch_size, beam_width, -1)) + batch_indices = ms.ops.arange(batch_size) + hypothesis_embedding = reshaped_embedding[batch_indices, best_idx_in_beam] + + gen_text_ids = gen_text_ids.reshape((batch_size, beam_width, -1))[ms.ops.arange(batch_size), best_idx_in_beam] + + return gen_text_ids, hypothesis_embedding + + def _beam_search_base_case(self, gen_text_ids, + hypothesis_embedding, inputs, batch_size, transition_scores): + '''no test for corrector''' + beam_width = int(gen_text_ids.shape[0] / batch_size) + frozen_embeddings_per_beam = inputs["frozen_embeddings"][:, None, :].repeat((1, beam_width, 1)) + + distances_per_beam = ms.ops.CosineSimilarity(dim=2)( + 
hypothesis_embedding.reshape((batch_size, beam_width, -1)), + frozen_embeddings_per_beam + ) + + scores = transition_scores.reshape((batch_size, beam_width)) + best_idx_in_beam = ms.ops.Argmax()(scores, axis=1) + + reshaped_hypothesis_embedding = hypothesis_embedding.reshape((batch_size, beam_width, -1)) + + hypothesis_embedding = reshaped_hypothesis_embedding[ms.ops.arange(batch_size), best_idx_in_beam] + + gen_text_ids = gen_text_ids.reshape((batch_size, beam_width, -1))[ms.ops.arange(batch_size), best_idx_in_beam] + + return gen_text_ids, hypothesis_embedding + + def _beam_search_top_k(self, gen_text_ids, hypothesis_embedding, + inputs, batch_size, sequence_beam_width, num_recursive_steps, transition_scores): + '''no test for corrector''' + beam_width = int(gen_text_ids.shape[0] / batch_size) + assert beam_width % sequence_beam_width == 0, "inner beam width must divide sequence beam width" + + expanded_frozen_embeddings = inputs["frozen_embeddings"][:, None, :].repeat((1, sequence_beam_width, 1)) + + + frozen_embeddings_per_beam = expanded_frozen_embeddings.reshape( + (batch_size, sequence_beam_width * num_recursive_steps, -1) + ) + + + distances_per_beam = ms.ops.CosineSimilarity(dim=2)( + hypothesis_embedding.reshape((batch_size, beam_width, -1)), + frozen_embeddings_per_beam + ) + + scores = transition_scores.reshape((batch_size, beam_width)) + best_idx_in_beam_total = ms.ops.TopK(k=beam_width)(scores, axis=1).indices + hypothesis_embedding = hypothesis_embedding.reshape((batch_size, beam_width, -1)) + gen_text_ids = gen_text_ids.reshape((batch_size, beam_width, -1)) + + + best_idx_in_beam = self._select_best_idx_in_beam( + best_idx_in_beam_total, + gen_text_ids, + sequence_beam_width + ) + #原来的太长了,改用局部变量 + reshaped_hypothesis_embedding = hypothesis_embedding.reshape((batch_size, beam_width, -1)) + indices = ms.ops.arange(batch_size)[:, None] + hypothesis_embedding = reshaped_hypothesis_embedding[indices, best_idx_in_beam] + + #原来的太长了,改用局部变量 + reshaped_gen_text_ids = gen_text_ids.reshape((batch_size, beam_width, -1)) + indices = ms.ops.arange(batch_size)[:, None] + gen_text_ids = reshaped_gen_text_ids[indices, best_idx_in_beam] + + return gen_text_ids, hypothesis_embedding + + def _select_best_idx_in_beam(self, best_idx_in_beam_total, gen_text_ids, sequence_beam_width): + + '''no test for corrector''' + best_idx_in_beam = [] + for batch_idx in range(len(best_idx_in_beam_total)): + gen_text_set = set() # track uniqueness + best_idx_in_beam.append([]) + for j in best_idx_in_beam_total[batch_idx].tolist(): + gen_text_i = tuple(gen_text_ids[batch_idx, j].tolist()) + if gen_text_i not in gen_text_set: + gen_text_set.add(gen_text_i) + best_idx_in_beam[batch_idx].append(j) + if len(best_idx_in_beam[batch_idx]) == sequence_beam_width: + break + best_idx_in_beam = ms.Tensor(best_idx_in_beam) + return best_idx_in_beam + + + def get_frozen_embeddings(self, embedder_input_ids: ms.Tensor, embedder_attention_mask: ms.Tensor,) -> ms.Tensor: + '''get frozen embeddings''' + + + frozen_embeddings = self.inversion_trainer.call_embedding_model( + input_ids=embedder_input_ids, + attention_mask=embedder_attention_mask, + ) + + return frozen_embeddings + + def embed_generated_hypothesis(self, input_ids: ms.Tensor) -> ms.Tensor: + """Embeds a generated hypothesis. Has to remove EOS token and add BOS token + at the beginning. 
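+        In this port the ids are decoded back to text and re-tokenized with the
+        embedder tokenizer before being passed to the frozen embedder.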
+ """ + inputs_str = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True) + emb_input_ids = self.embedder_tokenizer( + inputs_str, + max_length=self.model.config.max_seq_length, + truncation=True, + padding="max_length", + return_tensors="ms", + ) + return self.get_frozen_embeddings( + embedder_input_ids=emb_input_ids.input_ids, + embedder_attention_mask=emb_input_ids.attention_mask, + ) + + def _get_hypothesis_uncached(self, inputs: Dict[str, ms.Tensor]) -> ms.Tensor: + ''' + get hypothesis uncached + ''' + if "frozen_embeddings" in inputs: + frozen_embeddings = inputs["frozen_embeddings"] + elif "embedder_input_ids" in inputs: + frozen_embeddings = self.get_frozen_embeddings( + embedder_input_ids=inputs["embedder_input_ids"], + embedder_attention_mask=inputs["embedder_attention_mask"], + ) + else: + assert ( + "input_ids" in inputs + ), f"cannot generate hypothesis with input keys: {inputs.keys()}" + frozen_embeddings = self.embed_generated_hypothesis( + input_ids=inputs["input_ids"] + ) + + generation_kwargs = { + "early_stopping": False, + "num_beams": 1, + "do_sample": False, + "no_repeat_ngram_size": 0, + "max_length": self.model.config.max_seq_length, + } + + hypothesis_input_ids = self.inversion_trainer.model.generate_corrector( + inputs={ + "frozen_embeddings": frozen_embeddings, + }, + generation_kwargs=generation_kwargs, + ) + hypothesis_attention_mask = ( + hypothesis_input_ids != self.model.encoder_decoder.config.pad_token_id + ) + hypothesis_embedding = self.embed_generated_hypothesis( + input_ids=hypothesis_input_ids + ) + return ( #打个断点,检查一下数据都对不对 + frozen_embeddings, + hypothesis_input_ids, + hypothesis_attention_mask, + hypothesis_embedding, + ) + #pylint: disable=W0613 + def compute_loss(self, model: CorrectorEncoderModel, inputs: Dict[str, ms.Tensor], + return_outputs: bool = False,) -> Union[Tuple[ms.Tensor, Dict[str, ms.Tensor]], ms.Tensor]: + ''' + compute loss + ''' + #batch_size, seq_length = inputs["input_ids"].shape + + try: + frozen_embeddings = inputs["frozen_embeddings"] + hypothesis_input_ids = inputs["hypothesis_input_ids"] + hypothesis_attention_mask = inputs["hypothesis_attention_mask"] + hypothesis_embedding = inputs["hypothesis_embedding"] + except KeyError: + print("+++++++++") + ( + frozen_embeddings, + hypothesis_input_ids, + hypothesis_attention_mask, + hypothesis_embedding, + ) = self._get_hypothesis_uncached(inputs=inputs) + + labels = inputs["labels"] + outputs = self.model( + embedding=frozen_embeddings, + hypothesis_embedding=hypothesis_embedding, + hypothesis_input_ids=hypothesis_input_ids, + hypothesis_attention_mask=hypothesis_attention_mask, + labels=labels, + ) + return outputs.loss + + #pylint: disable=W0613 + def prediction_step(self, model: ms.nn.Cell, inputs: Dict[str, Union[ms.Tensor, Any]], prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None,): + """Perform an evaluation step on `model` using `inputs`. 
Called during self.evalaute()""" + inputs = {key: value for key, value in inputs.items()} + loss = self.compute_loss(model=model, inputs=inputs) + + logits, labels = None, None + return loss, logits, labels + + def _remap_state_dict(self, state_dict: Dict) -> Dict: + """Edit keys posthumously on model load.""" + # Rename keys for backward compatibility w/ model trained before + # we stopped sharing params between the ff layers + if {"embedding_transform.3.weight", "embedding_transform.3.bias",} <= state_dict.keys(): + print( + "Renaming keys", + {"embedding_transform.2.weight", "embedding_transform.2.bias"}, + "for backward compatibility.", + ) + state_dict["embedding_transform_1.0.weight"] = state_dict.pop( + "embedding_transform.0.weight" + ) + state_dict["embedding_transform_1.0.bias"] = state_dict.pop( + "embedding_transform.0.bias" + ) + state_dict["embedding_transform_1.3.weight"] = state_dict.pop( + "embedding_transform.3.weight" + ) + state_dict["embedding_transform_1.3.bias"] = state_dict.pop( + "embedding_transform.3.bias" + ) + # + state_dict["embedding_transform_2.0.weight"] = state_dict[ + "embedding_transform_1.0.weight" + ] + state_dict["embedding_transform_2.0.bias"] = state_dict[ + "embedding_transform_1.0.bias" + ] + state_dict["embedding_transform_2.3.weight"] = state_dict[ + "embedding_transform_1.3.weight" + ] + state_dict["embedding_transform_2.3.bias"] = state_dict[ + "embedding_transform_1.3.bias" + ] + # + state_dict["embedding_transform_3.0.weight"] = state_dict[ + "embedding_transform_1.0.weight" + ] + state_dict["embedding_transform_3.0.bias"] = state_dict[ + "embedding_transform_1.0.bias" + ] + state_dict["embedding_transform_3.3.weight"] = state_dict[ + "embedding_transform_1.3.weight" + ] + state_dict["embedding_transform_3.3.bias"] = state_dict[ + "embedding_transform_1.3.bias" + ] + return state_dict diff --git a/examples/privacy/embedding_inversion/vec2text/trainers/inversion.py b/examples/privacy/embedding_inversion/vec2text/trainers/inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..07936bea24b5d558ee37f4b89d3b9158b4bec8b2 --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/trainers/inversion.py @@ -0,0 +1,68 @@ +''' +inversion trainer +''' +from datetime import datetime +from typing import Dict + +import mindspore as ms + +from trainers.base import BaseTrainer + +class InversionTrainer(BaseTrainer): + + '''InversionTrainer''' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokenizer = self.model.tokenizer + self.embedder_tokenizer = self.model.embedder_tokenizer + self.call_embedding_model = self.model.call_embedding_model + self.embedder = self.model.embedder + self.counter = 0 + self.last_time_logger = datetime.now() + self.each_time_logger = datetime.now() + + def generate(self, inputs: Dict, generation_kwargs: Dict) -> ms.Tensor: + return self.model.generate(inputs=inputs, generation_kwargs=generation_kwargs) + + def training_step(self, model: ms.nn.Cell, inputs: Dict[str, ms.Tensor]) -> ms.Tensor: + """ + Performs a training step. we override to compute data-specific metrics. + """ + # TODO: Log training metrics from below... (How to do with huggingface?) 
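+        # The counter and timestamps below print the wall time of every step and of each
+        # 100-step window, to track the per-step slowdown observed on GPU (see README).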
+ self.counter += 1 + print(self.counter, " ", datetime.now() - self.each_time_logger) + self.each_time_logger = datetime.now() + + if self.counter % 100 == 0: + print("this 100 step consume:") + print(datetime.now()-self.last_time_logger) + self.last_time_logger = datetime.now() + self._compute_data_metrics(inputs=inputs) + # self.log({ f"train/{k}": v for k,v in metrics.items() }) + return super().training_step(model, inputs) + + + def _remap_state_dict(self, state_dict: Dict) -> Dict: + """Edit keys posthumously on model load.""" + # Rename keys for backward compatibility w/ model trained before + # we added extra dropout to the model + if { + "embedding_transform.2.weight", + "embedding_transform.2.bias", + } <= state_dict.keys(): + print( + "Renaming keys", + {"embedding_transform.2.weight", "embedding_transform.2.bias"}, + "for backward compatibility.", + ) + state_dict["embedding_transform.3.weight"] = state_dict.pop( + "embedding_transform.2.weight" + ) + state_dict["embedding_transform.3.bias"] = state_dict.pop( + "embedding_transform.2.bias" + ) + return state_dict + + #def _prepare_input(self, x): + #return None diff --git a/examples/privacy/embedding_inversion/vec2text/utils/__init__.py b/examples/privacy/embedding_inversion/vec2text/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9ab8dcc58e13c00e392a12c8ce78d8033ef0ebfe --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/utils/__init__.py @@ -0,0 +1,4 @@ +''' +init +''' +from .utils import * # noqa: F401,F403 diff --git a/examples/privacy/embedding_inversion/vec2text/utils/utils.py b/examples/privacy/embedding_inversion/vec2text/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0903b2baacf2a8a0e2109a40ac0a5f11224e8dba --- /dev/null +++ b/examples/privacy/embedding_inversion/vec2text/utils/utils.py @@ -0,0 +1,108 @@ +''' +utiliaztions for training +''' + +import multiprocessing +import os +from typing import Callable + +import tqdm +import datasets +import mindspore as ms +from mindnlp.transformers import AutoTokenizer + +datasets.disable_caching() + +def emb(model: ms.nn.Cell, input_ids: ms.Tensor, attention_mask: ms.Tensor) -> ms.Tensor: + model.set_train(False) + embedding = model.call_embedding_model( + input_ids=input_ids, attention_mask=attention_mask + ) + model.set_train(True) + return embedding + +def get_world_size() -> int: + try: + return os.environ.get("WORLD_SIZE", 1) + except (RuntimeError, ValueError): + return 1 + + +def get_num_proc() -> int: + world_size: int = get_world_size() + try: + # os.sched_getaffinity respects schedulers, unlike cpu_count(), but it's only available + # on some Unix platforms, so we support both! + return len(os.sched_getaffinity(0)) // world_size # type: ignore[attr-defined] + except AttributeError: + return multiprocessing.cpu_count() // world_size + +#pylint: disable=C0103 +def embed_all_tokens(model: ms.nn.Cell, tokenizer: AutoTokenizer): + """Generates embeddings for all tokens in tokenizer vocab.""" + i = 0 + model.embedder.eval() + batch_size = 1024 + all_token_embeddings = [] + v = tokenizer.vocab_size + # + # DPR has CLS and SEP. + # GTR has no CLS or start token at all, and has EOS at the end. 
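+    # Each vocabulary id is wrapped as [CLS] <token> [SEP] when the tokenizer has a CLS
+    # token, or as <token> <EOS/SEP> otherwise, and embedded in minibatches of `batch_size`.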
+ CLS = tokenizer.cls_token_id + SEP = (tokenizer.sep_token_id) or (tokenizer.eos_token_id) + assert SEP is not None + # + # device = next(model.parameters()).device + pbar = tqdm.tqdm( + desc="generating token embeddings", colour="#008080", total=v, leave=False + ) + while i < v: + # + minibatch_size = min(v - i, batch_size) + inputs = ms.arange(i, min(i + minibatch_size, v)) + # + if CLS is not None: + input_ids = ms.stack( + [ + ms.tensor([CLS]).repeat(len(inputs)), + inputs, + ms.tensor([SEP]).repeat(len(inputs)), + ] + ).T + else: + input_ids = ms.stack([inputs, ms.tensor([SEP]).repeat(len(inputs))]).T + # input_ids = input_ids.to(device) + # + attention_mask = ms.ones_like(input_ids) + # + model.set_train(False) + token_embeddings = emb(model, input_ids, attention_mask) + model.set_train(True) + all_token_embeddings.extend(token_embeddings) + i += batch_size + pbar.update(batch_size) + # + all_token_embeddings_tensor: ms.Tensor = ms.stack(all_token_embeddings) + assert all_token_embeddings_tensor.shape == (tokenizer.vocab_size, 768) + + all_token_embeddings_tensor /= all_token_embeddings_tensor.norm( + p=2, dim=1, keepdim=True + ) + return all_token_embeddings_tensor + + +def convert_to_tensor(data): + return ms.Tensor(data, ms.int64) +def add_index(data, idx): + # + data["idx"] = idx + + return data + +def dataset_map_single_worker(dataset, map_fn: Callable, *args, **kwargs) -> datasets.Dataset: + # kwargs["num_proc"] = kwargs.get("num_proc", 1) + + das = dataset.map(map_fn, *args, **kwargs) + return das + +manifest_object = None
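+# Illustrative usage sketch (names are assumptions, not defined in this module: `model`
+# is an inversion model exposing `call_embedding_model` and `embedder`, and `tokenizer`
+# is its embedder tokenizer):
+#
+#     token_embs = embed_all_tokens(model, tokenizer)  # (vocab_size, 768), L2-normalized
+#     indexed = dataset_map_single_worker(dataset, map_fn=add_index, with_indices=True)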