|
| 1 | +{ |
| 2 | +"cells": [ |
| 3 | + { |
| 4 | +"cell_type":"code", |
| 5 | +"execution_count":5, |
| 6 | +"metadata": {}, |
| 7 | +"outputs": [ |
| 8 | + { |
| 9 | +"name":"stdout", |
| 10 | +"output_type":"stream", |
| 11 | +"text": [ |
| 12 | +"root\n", |
| 13 | +" |-- name: string (nullable = true)\n", |
| 14 | +" |-- day: string (nullable = true)\n", |
| 15 | +" |-- month: string (nullable = true)\n", |
| 16 | +" |-- year: string (nullable = true)\n", |
| 17 | +"\n" |
| 18 | + ] |
| 19 | + } |
| 20 | + ], |
| 21 | +"source": [ |
| 22 | +"import findspark\n", |
| 23 | +"findspark.init()\n", |
| 24 | +"\n", |
| 25 | +"import pyspark\n", |
| 26 | +"from pyspark.sql import *\n", |
| 27 | +"from pyspark.sql.functions import *\n", |
| 28 | +"from pyspark.sql.types import *\n", |
| 29 | +"\n", |
| 30 | +"spark = SparkSession.builder\\\n", |
| 31 | +" .master(\"local[3]\")\\\n", |
| 32 | +" .appName(\"MiscDemo\")\\\n", |
| 33 | +" .getOrCreate()\n", |
| 34 | +"\n", |
| 35 | +"data_list = [(\"Ravi\",\"28\",\"1\",\"2002\"),\n", |
| 36 | +" (\"Abdul\",\"23\",\"5\",\"81\"), # 1981\n", |
| 37 | +" (\"John\",\"12\",\"12\",\"6\"), # 2006\n", |
| 38 | +" (\"Rosy\",\"7\",\"8\",\"63\"), # 1963\n", |
| 39 | +" (\"Abdul\",\"23\",\"5\",\"81\") # 1981\n", |
| 40 | +" ]\n", |
| 41 | +"raw_df = spark.createDataFrame(data_list).toDF(\"name\",\"day\",\"month\",\"year\").repartition(3)\n", |
| 42 | +"raw_df.printSchema()" |
| 43 | + ] |
| 44 | + }, |
| 45 | + { |
| 46 | +"cell_type":"code", |
| 47 | +"execution_count":7, |
| 48 | +"metadata": {}, |
| 49 | +"outputs": [ |
| 50 | + { |
| 51 | +"name":"stdout", |
| 52 | +"output_type":"stream", |
| 53 | +"text": [ |
| 54 | +"+-----+---+-----+----+-----------+\n", |
| 55 | +"| name|day|month|year| id|\n", |
| 56 | +"+-----+---+-----+----+-----------+\n", |
| 57 | +"|Abdul| 23| 5| 81| 0|\n", |
| 58 | +"| Ravi| 28| 1|2002| 8589934592|\n", |
| 59 | +"|Abdul| 23| 5| 81| 8589934593|\n", |
| 60 | +"| John| 12| 12| 6|17179869184|\n", |
| 61 | +"| Rosy| 7| 8| 63|17179869185|\n", |
| 62 | +"+-----+---+-----+----+-----------+\n", |
| 63 | +"\n" |
| 64 | + ] |
| 65 | + } |
| 66 | + ], |
| 67 | +"source": [ |
| 68 | +"df1 = raw_df.withColumn(\"id\", monotonically_increasing_id())\n", |
| 69 | +"df1.show()" |
| 70 | + ] |
| 71 | + }, |
| 72 | + { |
| 73 | +"cell_type":"code", |
| 74 | +"execution_count":8, |
| 75 | +"metadata": {}, |
| 76 | +"outputs": [ |
| 77 | + { |
| 78 | +"name":"stdout", |
| 79 | +"output_type":"stream", |
| 80 | +"text": [ |
| 81 | +"+-----+---+-----+------+-----------+\n", |
| 82 | +"| name|day|month| year| id|\n", |
| 83 | +"+-----+---+-----+------+-----------+\n", |
| 84 | +"|Abdul| 23| 5|1981.0| 0|\n", |
| 85 | +"| Ravi| 28| 1| 2002| 8589934592|\n", |
| 86 | +"|Abdul| 23| 5|1981.0| 8589934593|\n", |
| 87 | +"| John| 12| 12|2006.0|17179869184|\n", |
| 88 | +"| Rosy| 7| 8|1963.0|17179869185|\n", |
| 89 | +"+-----+---+-----+------+-----------+\n", |
| 90 | +"\n" |
| 91 | + ] |
| 92 | + } |
| 93 | + ], |
| 94 | +"source": [ |
| 95 | +"df2 = df1.withColumn(\"year\", expr(\"\"\"\n", |
| 96 | +" case when year < 21 then year + 2000\n", |
| 97 | +" when year < 100 then year + 1900\n", |
| 98 | +" else year\n", |
| 99 | +" end\"\"\"))\n", |
| 100 | +"df2.show()" |
| 101 | + ] |
| 102 | + }, |
| 103 | + { |
| 104 | +"cell_type":"code", |
| 105 | +"execution_count":9, |
| 106 | +"metadata": {}, |
| 107 | +"outputs": [ |
| 108 | + { |
| 109 | +"name":"stdout", |
| 110 | +"output_type":"stream", |
| 111 | +"text": [ |
| 112 | +"+-----+---+-----+----+-----------+\n", |
| 113 | +"| name|day|month|year| id|\n", |
| 114 | +"+-----+---+-----+----+-----------+\n", |
| 115 | +"|Abdul| 23| 5|1981| 0|\n", |
| 116 | +"| Ravi| 28| 1|2002| 8589934592|\n", |
| 117 | +"|Abdul| 23| 5|1981| 8589934593|\n", |
| 118 | +"| John| 12| 12|2006|17179869184|\n", |
| 119 | +"| Rosy| 7| 8|1963|17179869185|\n", |
| 120 | +"+-----+---+-----+----+-----------+\n", |
| 121 | +"\n" |
| 122 | + ] |
| 123 | + } |
| 124 | + ], |
| 125 | +"source": [ |
| 126 | +"df3 = df1.withColumn(\"year\", expr(\"\"\"\n", |
| 127 | +" case when year < 21 then cast(year as int) + 2000\n", |
| 128 | +" when year < 100 then cast(year as int) + 1900\n", |
| 129 | +" else year\n", |
| 130 | +" end\"\"\"))\n", |
| 131 | +"df3.show()" |
| 132 | + ] |
| 133 | + }, |
| 134 | + { |
| 135 | +"cell_type":"code", |
| 136 | +"execution_count":11, |
| 137 | +"metadata": {}, |
| 138 | +"outputs": [ |
| 139 | + { |
| 140 | +"name":"stdout", |
| 141 | +"output_type":"stream", |
| 142 | +"text": [ |
| 143 | +"+-----+---+-----+----+-----------+\n", |
| 144 | +"| name|day|month|year| id|\n", |
| 145 | +"+-----+---+-----+----+-----------+\n", |
| 146 | +"|Abdul| 23| 5|1981| 0|\n", |
| 147 | +"| Ravi| 28| 1|2002| 8589934592|\n", |
| 148 | +"|Abdul| 23| 5|1981| 8589934593|\n", |
| 149 | +"| John| 12| 12|2006|17179869184|\n", |
| 150 | +"| Rosy| 7| 8|1963|17179869185|\n", |
| 151 | +"+-----+---+-----+----+-----------+\n", |
| 152 | +"\n", |
| 153 | +"root\n", |
| 154 | +" |-- name: string (nullable = true)\n", |
| 155 | +" |-- day: string (nullable = true)\n", |
| 156 | +" |-- month: string (nullable = true)\n", |
| 157 | +" |-- year: integer (nullable = true)\n", |
| 158 | +" |-- id: long (nullable = false)\n", |
| 159 | +"\n" |
| 160 | + ] |
| 161 | + } |
| 162 | + ], |
| 163 | +"source": [ |
| 164 | +"df4 = df1.withColumn(\"year\", expr(\"\"\"\n", |
| 165 | +" case when year < 21 then year + 2000\n", |
| 166 | +" when year < 100 then year + 1900\n", |
| 167 | +" else year\n", |
| 168 | +" end\"\"\").cast(IntegerType()))\n", |
| 169 | +"df4.show()\n", |
| 170 | +"df4.printSchema()" |
| 171 | + ] |
| 172 | + }, |
| 173 | + { |
| 174 | +"cell_type":"code", |
| 175 | +"execution_count":14, |
| 176 | +"metadata": {}, |
| 177 | +"outputs": [ |
| 178 | + { |
| 179 | +"name":"stdout", |
| 180 | +"output_type":"stream", |
| 181 | +"text": [ |
| 182 | +"+-----+---+-----+----+-----------+\n", |
| 183 | +"| name|day|month|year| id|\n", |
| 184 | +"+-----+---+-----+----+-----------+\n", |
| 185 | +"|Abdul| 23| 5|1981| 0|\n", |
| 186 | +"| Ravi| 28| 1|2002| 8589934592|\n", |
| 187 | +"|Abdul| 23| 5|1981| 8589934593|\n", |
| 188 | +"| John| 12| 12|2006|17179869184|\n", |
| 189 | +"| Rosy| 7| 8|1963|17179869185|\n", |
| 190 | +"+-----+---+-----+----+-----------+\n", |
| 191 | +"\n" |
| 192 | + ] |
| 193 | + } |
| 194 | + ], |
| 195 | +"source": [ |
| 196 | +"df5 = df1.withColumn(\"day\", col(\"day\").cast(IntegerType()))\\\n", |
| 197 | +" .withColumn(\"month\", col(\"month\").cast(IntegerType()))\\\n", |
| 198 | +" .withColumn(\"year\", col(\"year\").cast(IntegerType()))\n", |
| 199 | +"\n", |
| 200 | +"df6 = df5.withColumn(\"year\", expr(\"\"\"\n", |
| 201 | +" case when year < 21 then year + 2000\n", |
| 202 | +" when year < 100 then year + 1900\n", |
| 203 | +" else year\n", |
| 204 | +" end\"\"\"))\n", |
| 205 | +"df6.show()" |
| 206 | + ] |
| 207 | + }, |
| 208 | + { |
| 209 | +"cell_type":"code", |
| 210 | +"execution_count":15, |
| 211 | +"metadata": {}, |
| 212 | +"outputs": [ |
| 213 | + { |
| 214 | +"name":"stdout", |
| 215 | +"output_type":"stream", |
| 216 | +"text": [ |
| 217 | +"+-----+---+-----+----+-----------+\n", |
| 218 | +"| name|day|month|year| id|\n", |
| 219 | +"+-----+---+-----+----+-----------+\n", |
| 220 | +"|Abdul| 23| 5|1981| 0|\n", |
| 221 | +"| Ravi| 28| 1|2002| 8589934592|\n", |
| 222 | +"|Abdul| 23| 5|1981| 8589934593|\n", |
| 223 | +"| John| 12| 12|2006|17179869184|\n", |
| 224 | +"| Rosy| 7| 8|1963|17179869185|\n", |
| 225 | +"+-----+---+-----+----+-----------+\n", |
| 226 | +"\n" |
| 227 | + ] |
| 228 | + } |
| 229 | + ], |
| 230 | +"source": [ |
| 231 | +"df7 = df5.withColumn(\"year\",\\\n", |
| 232 | +" when(col(\"year\") < 21, col(\"year\") + 2000)\\\n", |
| 233 | +" .when(col(\"year\") < 100, col(\"year\") + 1900)\\\n", |
| 234 | +" .otherwise(col(\"year\")))\n", |
| 235 | +"df7.show()" |
| 236 | + ] |
| 237 | + }, |
| 238 | + { |
| 239 | +"cell_type":"code", |
| 240 | +"execution_count":16, |
| 241 | +"metadata": {}, |
| 242 | +"outputs": [ |
| 243 | + { |
| 244 | +"name":"stdout", |
| 245 | +"output_type":"stream", |
| 246 | +"text": [ |
| 247 | +"+-----+---+-----+----+-----------+----------+\n", |
| 248 | +"| name|day|month|year| id| dob|\n", |
| 249 | +"+-----+---+-----+----+-----------+----------+\n", |
| 250 | +"|Abdul| 23| 5|1981| 0|1981-05-23|\n", |
| 251 | +"| Ravi| 28| 1|2002| 8589934592|2002-01-28|\n", |
| 252 | +"|Abdul| 23| 5|1981| 8589934593|1981-05-23|\n", |
| 253 | +"| John| 12| 12|2006|17179869184|2006-12-12|\n", |
| 254 | +"| Rosy| 7| 8|1963|17179869185|1963-08-07|\n", |
| 255 | +"+-----+---+-----+----+-----------+----------+\n", |
| 256 | +"\n" |
| 257 | + ] |
| 258 | + } |
| 259 | + ], |
| 260 | +"source": [ |
| 261 | +"df8 = df7.withColumn(\"dob\", expr(\"to_date(concat(day,'/',month,'/',year), 'd/M/y')\"))\n", |
| 262 | +"df8.show()" |
| 263 | + ] |
| 264 | + }, |
| 265 | + { |
| 266 | +"cell_type":"code", |
| 267 | +"execution_count":20, |
| 268 | +"metadata": {}, |
| 269 | +"outputs": [ |
| 270 | + { |
| 271 | +"name":"stdout", |
| 272 | +"output_type":"stream", |
| 273 | +"text": [ |
| 274 | +"+-----+-----------+----------+\n", |
| 275 | +"| name| id| dob|\n", |
| 276 | +"+-----+-----------+----------+\n", |
| 277 | +"| Rosy|17179869185|1963-08-07|\n", |
| 278 | +"|Abdul| 0|1981-05-23|\n", |
| 279 | +"| Ravi| 8589934592|2002-01-28|\n", |
| 280 | +"| John|17179869184|2006-12-12|\n", |
| 281 | +"+-----+-----------+----------+\n", |
| 282 | +"\n" |
| 283 | + ] |
| 284 | + } |
| 285 | + ], |
| 286 | +"source": [ |
| 287 | +"df9 = df7.withColumn(\"dob\", to_date(expr(\"concat(day,'/',month,'/',year)\"), 'd/M/y'))\\\n", |
| 288 | +" .drop(\"day\",\"month\",\"year\")\\\n", |
| 289 | +" .dropDuplicates([\"name\",\"dob\"])\\\n", |
| 290 | +" .sort(expr(\"dob desc\"))\n", |
| 291 | +"df9.show()" |
| 292 | + ] |
| 293 | + } |
| 294 | + ], |
| 295 | +"metadata": { |
| 296 | +"kernelspec": { |
| 297 | +"display_name":"Python 3", |
| 298 | +"language":"python", |
| 299 | +"name":"python3" |
| 300 | + }, |
| 301 | +"language_info": { |
| 302 | +"codemirror_mode": { |
| 303 | +"name":"ipython", |
| 304 | +"version":3 |
| 305 | + }, |
| 306 | +"file_extension":".py", |
| 307 | +"mimetype":"text/x-python", |
| 308 | +"name":"python", |
| 309 | +"nbconvert_exporter":"python", |
| 310 | +"pygments_lexer":"ipython3", |
| 311 | +"version":"3.7.6" |
| 312 | + } |
| 313 | + }, |
| 314 | +"nbformat":4, |
| 315 | +"nbformat_minor":2 |
| 316 | +} |