pydantic/pydantic-aiPublic

NotificationsYou must be signed in to change notification settings
Fork1k
Star11k

Commit036428c

Sumered

and

DouweM

authored

Added support for google specific arguments for video analysis (#2110)

Co-authored-by: Douwe Maan <douwe@pydantic.dev>

1 parentf49197c commit036428cCopy full SHA for 036428c

File tree

9 files changed

+270

-9

lines changed

pydantic_ai_slim
- pydantic_ai
  - messages.py
  - models
    - google.py
- pyproject.toml
tests
- conftest.py
- models
  - cassettes/test_google
    - test_google_model_video_as_binary_content_input_with_vendor_metadata.yaml
    - test_google_model_youtube_video_url_input_with_vendor_metadata.yaml
  - test_google.py
- test_agent.py
uv.lock

9 files changed

+270

-9

lines changed

`‎pydantic_ai_slim/pydantic_ai/messages.py`

Lines changed: 14 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -99,6 +99,13 @@ class FileUrl(ABC):`
`99`	`99`	`* If False, the URL is sent directly to the model and no download is performed.`
`100`	`100`	`"""`
`101`	`101`
	`102`	`+vendor_metadata:dict[str,Any]\|None=None`
	`103`	`+"""Vendor-specific metadata for the file.`
	`104`	`+`
	`105`	`+ Supported by:`
	`106`	+ - `GoogleModel`: `VideoUrl.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
	`107`	`+ """`
	`108`	`+`
`102`	`109`	`@property`
`103`	`110`	`@abstractmethod`
`104`	`111`	`defmedia_type(self)->str:`
`@@ -263,6 +270,13 @@ class BinaryContent:`
`263`	`270`	`media_type:AudioMediaType\|ImageMediaType\|DocumentMediaType\|str`
`264`	`271`	`"""The media type of the binary data."""`
`265`	`272`
	`273`	`+vendor_metadata:dict[str,Any]\|None=None`
	`274`	`+"""Vendor-specific metadata for the file.`
	`275`	`+`
	`276`	`+ Supported by:`
	`277`	+ - `GoogleModel`: `BinaryContent.vendor_metadata` is used as `video_metadata`: https://ai.google.dev/gemini-api/docs/video-understanding#customize-video-processing
	`278`	`+ """`
	`279`	`+`
`266`	`280`	`kind:Literal['binary']='binary'`
`267`	`281`	`"""Type identifier, this is available on all parts as a discriminator."""`
`268`	`282`

`‎pydantic_ai_slim/pydantic_ai/models/google.py`

Lines changed: 16 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -55,6 +55,7 @@`
`55`	`55`	`GenerateContentConfigDict,`
`56`	`56`	`GenerateContentResponse,`
`57`	`57`	`HttpOptionsDict,`
	`58`	`+MediaResolution,`
`58`	`59`	`Part,`
`59`	`60`	`PartDict,`
`60`	`61`	`SafetySettingDict,`
`@@ -120,6 +121,12 @@ class GoogleModelSettings(ModelSettings, total=False):`
`120`	`121`	`See the [Gemini API docs](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/add-labels-to-api-calls) for use cases and limitations.`
`121`	`122`	`"""`
`122`	`123`
	`124`	`+google_video_resolution:MediaResolution`
	`125`	`+"""The video resolution to use for the model.`
	`126`	`+`
	`127`	`+ See <https://ai.google.dev/api/generate-content#MediaResolution> for more information.`
	`128`	`+ """`
	`129`	`+`
`123`	`130`
`124`	`131`	`@dataclass(init=False)`
`125`	`132`	`classGoogleModel(Model):`
`@@ -291,6 +298,7 @@ async def _generate_content(`
`291`	`298`	`safety_settings=model_settings.get('google_safety_settings'),`
`292`	`299`	`thinking_config=model_settings.get('google_thinking_config'),`
`293`	`300`	`labels=model_settings.get('google_labels'),`
	`301`	`+media_resolution=model_settings.get('google_video_resolution'),`
`294`	`302`	`tools=cast(ToolListUnionDict,tools),`
`295`	`303`	`tool_config=tool_config,`
`296`	`304`	`response_mime_type=response_mime_type,`
`@@ -398,9 +406,15 @@ async def _map_user_prompt(self, part: UserPromptPart) -> list[PartDict]:`
`398`	`406`	`elifisinstance(item,BinaryContent):`
`399`	`407`	# NOTE: The type from Google GenAI is incorrect, it should be `str`, not `bytes`.
`400`	`408`	`base64_encoded=base64.b64encode(item.data).decode('utf-8')`
`401`		`-content.append({'inline_data': {'data':base64_encoded,'mime_type':item.media_type}})# type: ignore`
	`409`	`+inline_data_dict= {'inline_data': {'data':base64_encoded,'mime_type':item.media_type}}`
	`410`	`+ifitem.vendor_metadata:`
	`411`	`+inline_data_dict['video_metadata']=item.vendor_metadata`
	`412`	`+content.append(inline_data_dict)# type: ignore`
`402`	`413`	`elifisinstance(item,VideoUrl)anditem.is_youtube:`
`403`		`-content.append({'file_data': {'file_uri':item.url,'mime_type':item.media_type}})`
	`414`	`+file_data_dict= {'file_data': {'file_uri':item.url,'mime_type':item.media_type}}`
	`415`	`+ifitem.vendor_metadata:`
	`416`	`+file_data_dict['video_metadata']=item.vendor_metadata`
	`417`	`+content.append(file_data_dict)# type: ignore`
`404`	`418`	`elifisinstance(item,FileUrl):`
`405`	`419`	`ifself.system=='google-gla'oritem.force_download:`
`406`	`420`	`downloaded_item=awaitdownload_item(item,data_format='base64')`

`‎pydantic_ai_slim/pyproject.toml`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ logfire = ["logfire>=3.11.0"]`
`64`	`64`	`openai = ["openai>=1.76.0"]`
`65`	`65`	`cohere = ["cohere>=5.13.11; platform_system != 'Emscripten'"]`
`66`	`66`	`vertexai = ["google-auth>=2.36.0","requests>=2.32.2"]`
`67`		`-google = ["google-genai>=1.15.0"]`
	`67`	`+google = ["google-genai>=1.24.0"]`
`68`	`68`	`anthropic = ["anthropic>=0.52.0"]`
`69`	`69`	`groq = ["groq>=0.19.0"]`
`70`	`70`	`mistral = ["mistralai>=1.2.5"]`

`‎tests/conftest.py`

Lines changed: 25 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@`
`10`	`10`	`fromcontextlibimportcontextmanager`
`11`	`11`	`fromdataclassesimportdataclass`
`12`	`12`	`fromdatetimeimportdatetime`
	`13`	`+fromfunctoolsimportcached_property`
`13`	`14`	`frompathlibimportPath`
`14`	`15`	`fromtypesimportModuleType`
`15`	`16`	`fromtypingimportTYPE_CHECKING,Any,Callable`
`@@ -19,7 +20,7 @@`
`19`	`20`	`from_pytest.assertion.rewriteimportAssertionRewritingHook`
`20`	`21`	`frompytest_mockimportMockerFixture`
`21`	`22`	`fromtyping_extensionsimportTypeAlias`
`22`		`-fromvcrimportVCR`
	`23`	`+fromvcrimportVCR,requestasvcr_request`
`23`	`24`
`24`	`25`	`importpydantic_ai.models`
`25`	`26`	`frompydantic_ai.messagesimportBinaryContent`
`@@ -194,6 +195,29 @@ def pytest_recording_configure(config: Any, vcr: VCR):`
`194`	`195`
`195`	`196`	`vcr.register_serializer('yaml',json_body_serializer)`
`196`	`197`
	`198`	`+defmethod_matcher(r1:vcr_request.Request,r2:vcr_request.Request)->None:`
	`199`	`+ifr1.method.upper()!=r2.method.upper():`
	`200`	`+raiseAssertionError(f'{r1.method} !={r2.method}')`
	`201`	`+`
	`202`	`+vcr.register_matcher('method',method_matcher)`
	`203`	`+`
	`204`	`+`
	`205`	`+@pytest.fixture(autouse=True)`
	`206`	`+defmock_vcr_aiohttp_content(mocker:MockerFixture):`
	`207`	`+try:`
	`208`	`+fromvcr.stubsimportaiohttp_stubs`
	`209`	`+exceptImportError:`
	`210`	`+return`
	`211`	`+`
	`212`	+# google-genai calls `self.response_stream.content.readline()` where `self.response_stream` is a `MockClientResponse`,
	`213`	+# which creates a new `MockStream` each time instead of returning the same one, resulting in the readline cursor not being respected.
	`214`	+# So we turn `content` into a cached property to return the same one each time.
	`215`	`+# VCR issue: https://github.com/kevin1024/vcrpy/issues/927. Once that's is resolved, we can remove this patch.`
	`216`	`+cached_content=cached_property(aiohttp_stubs.MockClientResponse.content.fget)# type: ignore`
	`217`	`+cached_content.__set_name__(aiohttp_stubs.MockClientResponse,'content')`
	`218`	`+mocker.patch('vcr.stubs.aiohttp_stubs.MockClientResponse.content',new=cached_content)`
	`219`	`+mocker.patch('vcr.stubs.aiohttp_stubs.MockStream.set_exception',return_value=None)`
	`220`	`+`
`197`	`221`
`198`	`222`	`@pytest.fixture(scope='module')`
`199`	`223`	`defvcr_config():`

`‎tests/models/cassettes/test_google/test_google_model_video_as_binary_content_input_with_vendor_metadata.yaml`

Lines changed: 80 additions & 0 deletions

Large diffs are not rendered by default.

`‎tests/models/cassettes/test_google/test_google_model_youtube_video_url_input_with_vendor_metadata.yaml`

Lines changed: 73 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,73 @@`
	`1`	`+interactions:`
	`2`	`+-request:`
	`3`	`+headers:`
	`4`	`+content-type:`
	`5`	`+ -application/json`
	`6`	`+method:post`
	`7`	`+parsed_body:`
	`8`	`+contents:`
	`9`	`+ -parts:`
	`10`	`+ -text:Explain me this video`
	`11`	`+ -fileData:`
	`12`	`+fileUri:https://youtu.be/lCdaVNyHtjU`
	`13`	`+mimeType:video/mp4`
	`14`	`+videoMetadata:`
	`15`	`+fps:0.2`
	`16`	`+role:user`
	`17`	`+generationConfig:{}`
	`18`	`+systemInstruction:`
	`19`	`+parts:`
	`20`	`+ -text:You are a helpful chatbot.`
	`21`	`+role:user`
	`22`	`+uri:https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent`
	`23`	`+response:`
	`24`	`+headers:`
	`25`	`+alt-svc:`
	`26`	`+ -h3=":443"; ma=2592000,h3-29=":443"; ma=2592000`
	`27`	`+content-length:`
	`28`	`+ -'2759'`
	`29`	`+content-type:`
	`30`	`+ -application/json; charset=UTF-8`
	`31`	`+server-timing:`
	`32`	`+ -gfet4t7; dur=11467`
	`33`	`+transfer-encoding:`
	`34`	`+ -chunked`
	`35`	`+vary:`
	`36`	`+ -Origin`
	`37`	`+ -X-Origin`
	`38`	`+ -Referer`
	`39`	`+parsed_body:`
	`40`	`+candidates:`
	`41`	`+ -avgLogprobs:-0.4793745385795377`
	`42`	`+content:`
	`43`	`+parts:`
	`44`	`+ -text:\|-`
	`45`	`+ Okay, based on the image, here's what I can infer:`
	`46`	`+`
	`47`	`+ * A camera monitor is mounted on top of a camera.`
	`48`	`+ * The monitor's screen is on, displaying a view of the rocky mountains.`
	`49`	`+ * This setting suggests a professional video shoot.`
	`50`	`+`
	`51`	`+ If you'd like a more detailed explanation, please provide additional information about the video.`
	`52`	`+role:model`
	`53`	`+finishReason:STOP`
	`54`	`+modelVersion:gemini-2.0-flash`
	`55`	`+responseId:ldpraPqBM6HshMIPgsi60QI`
	`56`	`+usageMetadata:`
	`57`	`+candidatesTokenCount:459`
	`58`	`+candidatesTokensDetails:`
	`59`	`+ -modality:TEXT`
	`60`	`+tokenCount:459`
	`61`	`+promptTokenCount:4605`
	`62`	`+promptTokensDetails:`
	`63`	`+ -modality:TEXT`
	`64`	`+tokenCount:10`
	`65`	`+ -modality:AUDIO`
	`66`	`+tokenCount:1475`
	`67`	`+ -modality:VIDEO`
	`68`	`+tokenCount:3120`
	`69`	`+totalTokenCount:5064`
	`70`	`+status:`
	`71`	`+code:200`
	`72`	`+message:OK`
	`73`	`+version:1`

`‎tests/models/test_google.py`

Lines changed: 43 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -420,6 +420,23 @@ async def test_google_model_video_as_binary_content_input(`
`420`	`420`	`""")`
`421`	`421`
`422`	`422`
	`423`	`+asyncdeftest_google_model_video_as_binary_content_input_with_vendor_metadata(`
	`424`	`+allow_model_requests:None,video_content:BinaryContent,google_provider:GoogleProvider`
	`425`	`+):`
	`426`	`+m=GoogleModel('gemini-2.0-flash',provider=google_provider)`
	`427`	`+agent=Agent(m,system_prompt='You are a helpful chatbot.')`
	`428`	`+video_content.vendor_metadata= {'start_offset':'2s','end_offset':'10s'}`
	`429`	`+`
	`430`	`+result=awaitagent.run(['Explain me this video',video_content])`
	`431`	`+assertresult.output==snapshot("""\`
	`432`	`+Okay, I can describe what is visible in the image.`
	`433`	`+`
	`434`	`+The image shows a camera setup in an outdoor setting. The camera is mounted on a tripod and has an external monitor attached to it. The monitor is displaying a scene that appears to be a desert landscape with rocky formations and mountains in the background. The foreground and background of the overall image, outside of the camera monitor, is also a blurry, desert landscape. The colors in the background are warm and suggest either sunrise, sunset, or reflected light off the rock formations.`
	`435`	`+`
	`436`	`+It looks like someone is either reviewing footage on the monitor, or using it as an aid for framing the shot.\`
	`437`	`+""")`
	`438`	`+`
	`439`	`+`
`423`	`440`	`asyncdeftest_google_model_image_url_input(allow_model_requests:None,google_provider:GoogleProvider):`
`424`	`441`	`m=GoogleModel('gemini-2.0-flash',provider=google_provider)`
`425`	`442`	`agent=Agent(m,system_prompt='You are a helpful chatbot.')`
`@@ -454,6 +471,32 @@ async def test_google_model_video_url_input(allow_model_requests: None, google_p`
`454`	`471`	`""")`
`455`	`472`
`456`	`473`
	`474`	`+asyncdeftest_google_model_youtube_video_url_input_with_vendor_metadata(`
	`475`	`+allow_model_requests:None,google_provider:GoogleProvider`
	`476`	`+):`
	`477`	`+m=GoogleModel('gemini-2.0-flash',provider=google_provider)`
	`478`	`+agent=Agent(m,system_prompt='You are a helpful chatbot.')`
	`479`	`+`
	`480`	`+result=awaitagent.run(`
	`481`	`+ [`
	`482`	`+'Explain me this video',`
	`483`	`+VideoUrl(`
	`484`	`+url='https://youtu.be/lCdaVNyHtjU',`
	`485`	`+vendor_metadata={'fps':0.2},`
	`486`	`+ ),`
	`487`	`+ ]`
	`488`	`+ )`
	`489`	`+assertresult.output==snapshot("""\`
	`490`	`+Okay, based on the image, here's what I can infer:`
	`491`	`+`
	`492`	`+* A camera monitor is mounted on top of a camera.`
	`493`	`+* The monitor's screen is on, displaying a view of the rocky mountains.`
	`494`	`+* This setting suggests a professional video shoot.`
	`495`	`+`
	`496`	`+If you'd like a more detailed explanation, please provide additional information about the video.\`
	`497`	`+""")`
	`498`	`+`
	`499`	`+`
`457`	`500`	`asyncdeftest_google_model_document_url_input(allow_model_requests:None,google_provider:GoogleProvider):`
`458`	`501`	`m=GoogleModel('gemini-2.0-flash',provider=google_provider)`
`459`	`502`	`agent=Agent(m,system_prompt='You are a helpful chatbot.')`

`‎tests/test_agent.py`

Lines changed: 4 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -2829,7 +2829,10 @@ def test_binary_content_all_messages_json():`
`2829`	`2829`	`{`
`2830`	`2830`	`'parts': [`
`2831`	`2831`	`{`
`2832`		`-'content': ['Hello', {'data':'SGVsbG8=','media_type':'text/plain','kind':'binary'}],`
	`2832`	`+'content': [`
	`2833`	`+'Hello',`
	`2834`	`+ {'data':'SGVsbG8=','media_type':'text/plain','vendor_metadata':None,'kind':'binary'},`
	`2835`	`+ ],`
`2833`	`2836`	`'timestamp':IsStr(),`
`2834`	`2837`	`'part_kind':'user-prompt',`
`2835`	`2838`	`}`

`‎uv.lock`

Lines changed: 14 additions & 4 deletions

Some generated files are not rendered by default. Learn more aboutcustomizing how changed files appear on GitHub.

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit036428c

File tree

9 files changed

9 files changed

`‎pydantic_ai_slim/pydantic_ai/messages.py`

`‎pydantic_ai_slim/pydantic_ai/models/google.py`

`‎pydantic_ai_slim/pyproject.toml`

`‎tests/conftest.py`

`‎tests/models/cassettes/test_google/test_google_model_video_as_binary_content_input_with_vendor_metadata.yaml`

`‎tests/models/cassettes/test_google/test_google_model_youtube_video_url_input_with_vendor_metadata.yaml`

`‎tests/models/test_google.py`

`‎tests/test_agent.py`

`‎uv.lock`

0 commit comments