from fastNLP.io import ChnSentiCorpLoader
from functools import partial

# Automatically downloads the data; per the loader docs, the returned datasets
# contain the "raw_chars" and "target" fields
data_bundle = ChnSentiCorpLoader().load()

# Tokenize the data with the tokenizer
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
tokenize = partial(tokenizer, max_length=256)  # cap the maximum sequence length
# Adds "input_ids", "attention_mask" and other new fields to the datasets
data_bundle.apply_field_more(tokenize, field_name='raw_chars', num_proc=4)
# Apply int() to every target and store the result in a new "labels" field
data_bundle.apply_field(int, field_name='target', new_field_name='labels')
print(data_bundle.get_dataset('train')[:4])

# Initialize the model
from paddlenlp.transformers import BertForSequenceClassification, LinearDecayWithWarmup
from paddle import optimizer, nn

class SeqClsModel(nn.Layer):
    def __init__(self, model_checkpoint, num_labels):
        super(SeqClsModel, self).__init__()
        self.num_labels = num_labels
        self.bert = BertForSequenceClassification.from_pretrained(model_checkpoint)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        logits = self.bert(input_ids, token_type_ids, position_ids, attention_mask)
        return logits

    def train_step(self, input_ids, labels, token_type_ids=None, position_ids=None, attention_mask=None):
        logits = self(input_ids, token_type_ids, position_ids, attention_mask)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,)))
        return {"logits": logits, "loss": loss}

    def evaluate_step(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        logits = self(input_ids, token_type_ids, position_ids, attention_mask)
        return {"logits": logits}

model = SeqClsModel('hfl/chinese-bert-wwm', num_labels=2)

# Prepare the dataloaders
from fastNLP import prepare_dataloader

dls = prepare_dataloader(data_bundle, batch_size=16)

# Schedule the learning rate over the course of training
scheduler = LinearDecayWithWarmup(2e-5, total_steps=20 * len(dls['train']), warmup=0.1)
optimizer = optimizer.AdamW(parameters=model.parameters(), learning_rate=scheduler)

# Prepare for training
from fastNLP import Trainer, Accuracy, LoadBestModelCallback, Event

callbacks = [
    LoadBestModelCallback()  # after training ends, reload the best-performing model
]

# Hook extra operations into specific points of the training loop; the arguments
# available differ by event; see the Trainer.on docs for each event's arguments
@Trainer.on(Event.on_before_backward())
def print_loss(trainer, outputs):
    if trainer.global_forward_batches % 10 == 0:  # print the loss every 10 batches
        print(outputs["loss"].item())

trainer = Trainer(
    model=model,
    train_dataloader=dls['train'],
    optimizers=optimizer,
    device=0,
    evaluate_dataloaders=dls['dev'],
    metrics={'acc': Accuracy()},
    callbacks=callbacks,
    monitor='acc#acc',
    # Accuracy.update() expects the pred and target arguments,
    # which correspond to the fields mapped below
    evaluate_output_mapping={'logits': 'pred'},
    evaluate_input_mapping={'labels': 'target'}
)
trainer.run()

# Evaluate on the test set
from fastNLP import Evaluator

evaluator = Evaluator(
    model=model,
    dataloaders=dls['test'],
    metrics={'acc': Accuracy()},
    # Accuracy.update() expects the pred and target arguments,
    # which correspond to the fields mapped below
    output_mapping={'logits': 'pred'},
    input_mapping={'labels': 'target'}
)
evaluator.run()
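
# --- Optional: single-sentence inference after training ---
# A minimal sketch, not part of the original tutorial: assuming training has run
# and LoadBestModelCallback left the best weights in `model`, the model can be
# queried directly with plain paddle. The example sentence is hypothetical.
import paddle

text = "这家店的服务很好"  # hypothetical input sentence ("this shop's service is great")
encoded = tokenizer(text)  # dict with "input_ids" and "token_type_ids"
input_ids = paddle.to_tensor([encoded["input_ids"]])
token_type_ids = paddle.to_tensor([encoded["token_type_ids"]])
model.eval()
with paddle.no_grad():
    logits = model(input_ids, token_type_ids=token_type_ids)
print(paddle.argmax(logits, axis=-1).item())  # 0/1 sentiment polarity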