Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c5d7413

Browse files
committed
add treeString support for printSchema
1 parent 386dd26 commit c5d7413

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

‎duckdb/experimental/spark/sql/types.py‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -894,6 +894,72 @@ def fieldNames(self) -> list[str]:
894894
"""
895895
returnlist(self.names)
896896

897+
def treeString(self, level: Optional[int] = None) -> str:
    """Return a string representation of the schema in tree format.

    Parameters
    ----------
    level : int, optional
        Maximum depth to print. If None, prints all levels.

    Returns
    -------
    str
        Tree-formatted schema string.

    Examples
    --------
    >>> schema = StructType([StructField("age", IntegerType(), True)])
    >>> print(schema.treeString())
    root
     |-- age: integer (nullable = true)
    """

    def _bool_str(flag: bool) -> str:
        """Render a bool the way Spark prints it in schema trees ("true"/"false")."""
        return "true" if flag else "false"

    def _tree_string(schema: "StructType", depth: int = 0, max_depth: Optional[int] = None) -> list[str]:
        """Recursively build tree string lines."""
        lines = []
        if depth == 0:
            lines.append("root")

        # Stop descending once the requested level has been reached.
        if max_depth is not None and depth >= max_depth:
            return lines

        for field in schema.fields:
            indent = " " * depth
            prefix = " |-- "
            nullable_str = _bool_str(field.nullable)

            # Handle nested StructType
            if isinstance(field.dataType, StructType):
                lines.append(f"{indent}{prefix}{field.name}: struct (nullable = {nullable_str})")
                # Recurse one level deeper; "root" only appears at depth 0.
                lines.extend(_tree_string(field.dataType, depth + 1, max_depth))
            # Handle ArrayType
            elif isinstance(field.dataType, ArrayType):
                element_type = field.dataType.elementType
                if isinstance(element_type, StructType):
                    # Array of structs: an intermediate "element" line, then the
                    # struct's own fields two levels deeper.
                    # FIX: containsNull previously rendered as Python's
                    # "True"/"False"; lowercase it for consistency with the
                    # nullable formatting above (Spark prints "true"/"false").
                    contains_null_str = _bool_str(field.dataType.containsNull)
                    lines.append(f"{indent}{prefix}{field.name}: array (nullable = {nullable_str})")
                    lines.append(f"{indent} |    |-- element: struct (containsNull = {contains_null_str})")
                    lines.extend(_tree_string(element_type, depth + 2, max_depth))
                else:
                    type_str = element_type.simpleString()
                    lines.append(f"{indent}{prefix}{field.name}: array<{type_str}> (nullable = {nullable_str})")
            # Handle MapType
            elif isinstance(field.dataType, MapType):
                key_type = field.dataType.keyType.simpleString()
                value_type = field.dataType.valueType.simpleString()
                lines.append(f"{indent}{prefix}{field.name}: map<{key_type},{value_type}> (nullable = {nullable_str})")
            # Handle simple types
            else:
                type_str = field.dataType.simpleString()
                lines.append(f"{indent}{prefix}{field.name}: {type_str} (nullable = {nullable_str})")

        return lines

    return "\n".join(_tree_string(self, 0, level))
962+
897963
def needConversion(self) -> bool:  # noqa: D102
    # We need to convert Row()/namedtuple values into plain tuple() before storage.
    return True

‎tests/fast/spark/test_spark_dataframe.py‎

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,3 +508,76 @@ def test_printSchema_negative_level(self, spark):
508508

509509
withpytest.raises(PySparkValueError):
510510
df.printSchema(level=-1)
511+
512+
def test_treeString_basic(self, spark):
    """A flat schema renders as a root line plus one ' |-- ' entry per column."""
    df = spark.createDataFrame([("Alice", 25, 5000)], ["name", "age", "salary"])
    rendered = df.schema.treeString()

    assert rendered.startswith("root\n")
    for column in ("name", "age", "salary"):
        assert f" |-- {column}:" in rendered
    assert "(nullable = true)" in rendered
    assert rendered.count(" |-- ") == 3
523+
524+
def test_treeString_nested_struct(self, spark):
    """Nested structs appear as a 'struct' entry with their fields listed below."""
    from spark_namespace.sql.types import IntegerType, StringType, StructField, StructType

    person = StructType(
        [
            StructField("name", StringType(), True),
            StructField("age", IntegerType(), True),
        ]
    )
    schema = StructType(
        [
            StructField("id", IntegerType(), True),
            StructField("person", person, True),
        ]
    )
    df = spark.createDataFrame([(1, {"name": "Alice", "age": 25})], schema)
    rendered = df.schema.treeString()

    assert "root\n" in rendered
    assert " |-- id:" in rendered
    assert " |-- person: struct (nullable = true)" in rendered
    assert "name:" in rendered
    assert "age:" in rendered
543+
544+
def test_treeString_with_level(self, spark):
    """level=1 limits the tree to top-level fields only."""
    from spark_namespace.sql.types import IntegerType, StringType, StructField, StructType

    details = StructType([StructField("address", StringType(), True)])
    person = StructType(
        [
            StructField("name", StringType(), True),
            StructField("details", details, True),
        ]
    )
    schema = StructType(
        [
            StructField("id", IntegerType(), True),
            StructField("person", person, True),
        ]
    )
    df = spark.createDataFrame(
        [(1, {"name": "Alice", "details": {"address": "123 Main St"}})], schema
    )

    # Level 1 should only show top-level fields
    shallow = df.schema.treeString(level=1)
    assert " |-- id:" in shallow
    assert " |-- person: struct" in shallow
    # Nested field names must be cut off: at most "root" + the two top-level lines.
    non_empty = [line for line in shallow.split("\n") if line.strip()]
    assert len(non_empty) <= 3
567+
568+
def test_treeString_array_type(self, spark):
    """Arrays of simple element types render inline as array<elementType>."""
    from spark_namespace.sql.types import ArrayType, StringType, StructField, StructType

    schema = StructType(
        [
            StructField("name", StringType(), True),
            StructField("hobbies", ArrayType(StringType()), True),
        ]
    )
    df = spark.createDataFrame([("Alice", ["reading", "coding"])], schema)
    rendered = df.schema.treeString()

    assert "root\n" in rendered
    assert " |-- name:" in rendered
    assert " |-- hobbies: array<" in rendered
    assert "(nullable = true)" in rendered

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp