Commit 1caeda0

Improve serialization of LLMJudge and custom evaluators (pydantic#1367)

1 parent ea34fc9 · commit 1caeda0
File tree: 7 files changed, +105 −21 lines


docs/api/pydantic_evals/evaluators.md

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
 # `pydantic_evals.evaluators`
 
 ::: pydantic_evals.evaluators
+
+::: pydantic_evals.evaluators.llm_as_a_judge
pydantic_ai_slim/pydantic_ai/models/openai.py

Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@ class OpenAIModel(Model):
     """
 
     client: AsyncOpenAI = field(repr=False)
-    system_prompt_role: OpenAISystemPromptRole | None = field(default=None)
+    system_prompt_role: OpenAISystemPromptRole | None = field(default=None, repr=False)
 
     _model_name: OpenAIModelName = field(repr=False)
     _system: str = field(default='openai', repr=False)
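For context, `field(repr=False)` simply excludes a dataclass field from the auto-generated `__repr__`, which is why `system_prompt_role` no longer appears when an `OpenAIModel` is printed. A minimal standalone sketch (not pydantic-ai code; the class and field names are illustrative):

```python
from dataclasses import dataclass, field


@dataclass
class Example:
    name: str
    # repr=False keeps this field out of the auto-generated __repr__
    secret: str | None = field(default=None, repr=False)


print(Example(name='demo', secret='hidden'))  # -> Example(name='demo')
```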

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 17 additions & 2 deletions

@@ -155,10 +155,14 @@ def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
 
 @dataclass
 class LLMJudge(Evaluator[object, object, object]):
-    """Judge whether the output of a language model meets the criteria of a provided rubric."""
+    """Judge whether the output of a language model meets the criteria of a provided rubric.
+
+    If you do not specify a model, it uses the default model for judging. This starts as 'openai:gpt-4o', but can be
+    overridden by calling [`set_default_judge_model`][pydantic_evals.evaluators.llm_as_a_judge.set_default_judge_model].
+    """
 
     rubric: str
-    model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
 
     async def evaluate(
@@ -175,6 +179,17 @@ async def evaluate(
         grading_output = await judge_output(ctx.output, self.rubric, self.model)
         return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)
 
+    def build_serialization_arguments(self):
+        result = super().build_serialization_arguments()
+        # always serialize the model as a string when present; use its name if it's a KnownModelName
+        if (model := result.get('model')) and isinstance(model, models.Model):
+            result['model'] = f'{model.system}:{model.model_name}'
+
+        # Note: this may lead to confusion if you try to serialize-then-deserialize with a custom model.
+        # I expect that is rare enough to be worth not solving yet, but common enough that we probably will want to
+        # solve it eventually. I'm imagining some kind of model registry, but don't want to work out the details yet.
+        return result
+
 
 @dataclass
 class HasMatchingSpan(Evaluator[object, object, object]):
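A hedged sketch of what the new hook returns for `LLMJudge`, following the logic in the diff above; the dicts in the comments are what that logic implies (fields still at their defaults are skipped by the base implementation), not captured snapshots:

```python
from pydantic_evals.evaluators import LLMJudge

judge = LLMJudge(rubric='Content contains a greeting', model='openai:gpt-4o')

args = judge.build_serialization_arguments()
# A KnownModelName string is left untouched:
# {'rubric': 'Content contains a greeting', 'model': 'openai:gpt-4o'}
print(args)

# A Model *instance* would instead be replaced by '<system>:<model_name>',
# e.g. 'my-system:my-model' in the new test_llm_judge_serialization test below.
```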

pydantic_evals/pydantic_evals/evaluators/evaluator.py

Lines changed: 23 additions & 9 deletions

@@ -223,6 +223,28 @@ def serialize(self, info: SerializationInfo) -> Any:
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        raw_arguments = self.build_serialization_arguments()
+
+        arguments: None | tuple[Any,] | dict[str, Any]
+        if len(raw_arguments) == 0:
+            arguments = None
+        elif len(raw_arguments) == 1:
+            arguments = (next(iter(raw_arguments.values())),)
+        else:
+            arguments = raw_arguments
+        return to_jsonable_python(
+            EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context, serialize_unknown=True
+        )
+
+    def build_serialization_arguments(self) -> dict[str, Any]:
+        """Build the arguments for serialization.
+
+        Evaluators are serialized for inclusion as the "source" in an `EvaluationResult`.
+        If you want to modify how the evaluator is serialized for that or other purposes, you can override this method.
+
+        Returns:
+            A dictionary of arguments to be used during serialization.
+        """
         raw_arguments: dict[str, Any] = {}
         for field in fields(self):
             value = getattr(self, field.name)
@@ -234,12 +256,4 @@ def serialize(self, info: SerializationInfo) -> Any:
             if value == field.default_factory():
                 continue
             raw_arguments[field.name] = value
-
-        arguments: None | tuple[Any,] | dict[str, Any]
-        if len(raw_arguments) == 0:
-            arguments = None
-        elif len(raw_arguments) == 1:
-            arguments = (next(iter(raw_arguments.values())),)
-        else:
-            arguments = raw_arguments
-        return to_jsonable_python(EvaluatorSpec(name=self.name(), arguments=arguments), context=info.context)
+        return raw_arguments
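The new docstring invites subclasses to override `build_serialization_arguments`; here is a hedged sketch of a hypothetical custom evaluator doing so. Only `Evaluator` and `EvaluatorContext` come from `pydantic_evals.evaluators`; `MyScorer` and its fields are invented for illustration:

```python
from dataclasses import dataclass
from typing import Any

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class MyScorer(Evaluator[object, object, object]):
    """Hypothetical evaluator whose serialized form hides a runtime-only handle."""

    threshold: float = 0.5
    client_handle: Any = None  # runtime object, not meaningful when serialized

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        return True  # placeholder scoring logic

    def build_serialization_arguments(self) -> dict[str, Any]:
        args = super().build_serialization_arguments()
        # drop the runtime handle, mirroring how LLMJudge rewrites its `model` field
        args.pop('client_handle', None)
        return args
```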

pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 27 additions & 7 deletions

@@ -8,7 +8,10 @@
 
 from pydantic_ai import Agent, models
 
-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output')
+__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
+
+
+_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
 
 
 class GradingOutput(BaseModel, populate_by_name=True):
@@ -41,11 +44,15 @@ class GradingOutput(BaseModel, populate_by_name=True):
 
 
 async def judge_output(
-    output: Any, rubric: str, model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
 ) -> GradingOutput:
-    """Judge the output of a model based on a rubric."""
+    """Judge the output of a model based on a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
     user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_output_agent.run(user_prompt, model=model)).data
+    return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).data
 
 
 _judge_input_output_agent = Agent(
@@ -72,11 +79,24 @@ async def judge_output(
 
 
 async def judge_input_output(
-    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
 ) -> GradingOutput:
-    """Judge the output of a model based on the inputs and a rubric."""
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
     user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_input_output_agent.run(user_prompt, model=model)).data
+    return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).data
+
+
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
+    """Set the default model used for judging.
+
+    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
+    """
+    global _default_model
+    _default_model = model
 
 
 def _stringify(value: Any) -> str:
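A brief usage sketch of the new default-model plumbing: the judge functions fall back to the module-level default when `model` is omitted, and `set_default_judge_model` swaps that default. The model name below is just an illustrative value, and the call makes a real request to whichever judge model is configured:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output, set_default_judge_model

# Swap the process-wide default judge once, e.g. at application startup.
set_default_judge_model('anthropic:claude-3-5-sonnet-latest')  # illustrative model name


async def main() -> None:
    # No `model` argument: judge_output falls back to the default set above.
    grading = await judge_output('The capital of France is Paris.', 'Answer is factually correct')
    print(grading.pass_, grading.reason)


asyncio.run(main())
```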

tests/evals/test_evaluator_common.py

Lines changed: 2 additions & 2 deletions

@@ -222,10 +222,10 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.value is True
     assert result.reason == 'Test passed'
 
-    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', 'openai:gpt-4o')
+    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None)
 
     # Test with input
-    evaluator = LLMJudge(rubric='Output contains input', include_input=True)
+    evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
     result = await evaluator.evaluate(ctx)
     assert isinstance(result, EvaluationReason)
     assert result.value is True
‎tests/evals/test_evaluators.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
frominline_snapshotimportsnapshot
88
frompydanticimportBaseModel,TypeAdapter
99

10+
frompydantic_ai.messagesimportModelMessage,ModelResponse
11+
frompydantic_ai.modelsimportModel,ModelRequestParameters
12+
frompydantic_ai.settingsimportModelSettings
13+
frompydantic_ai.usageimportUsage
14+
1015
from ..conftestimporttry_import
1116

1217
withtry_import()asimports_successful:
@@ -108,6 +113,34 @@ async def test_evaluator_spec_serialization():
108113
assertadapter.dump_python(spec_single_arg,context={'use_short_form':True})==snapshot({'MyEvaluator':'value1'})
109114

110115

116+
asyncdeftest_llm_judge_serialization():
117+
# Ensure models are serialized based on their system + name when used with LLMJudge
118+
119+
classMyModel(Model):
120+
asyncdefrequest(
121+
self,
122+
messages:list[ModelMessage],
123+
model_settings:ModelSettings|None,
124+
model_request_parameters:ModelRequestParameters,
125+
)->tuple[ModelResponse,Usage]:
126+
raiseNotImplementedError
127+
128+
@property
129+
defmodel_name(self)->str:
130+
return'my-model'
131+
132+
@property
133+
defsystem(self)->str:
134+
return'my-system'
135+
136+
adapter=TypeAdapter(Evaluator)
137+
138+
assertadapter.dump_python(LLMJudge(rubric='my rubric',model=MyModel()))== {
139+
'name':'LLMJudge',
140+
'arguments': {'model':'my-system:my-model','rubric':'my rubric'},
141+
}
142+
143+
111144
asyncdeftest_evaluator_call(test_context:EvaluatorContext[TaskInput,TaskOutput,TaskMetadata]):
112145
"""Test calling an Evaluator."""
113146
