Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit455fcef

Browse files
Initial Commit
1 parente3d6509 commit455fcef

File tree

1 file changed

+316
-0
lines changed

1 file changed

+316
-0
lines changed

‎12-MiscDemoNotebook/MiscDemo.ipynb‎

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type":"code",
5+
"execution_count":5,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name":"stdout",
10+
"output_type":"stream",
11+
"text": [
12+
"root\n",
13+
" |-- name: string (nullable = true)\n",
14+
" |-- day: string (nullable = true)\n",
15+
" |-- month: string (nullable = true)\n",
16+
" |-- year: string (nullable = true)\n",
17+
"\n"
18+
]
19+
}
20+
],
21+
"source": [
22+
"import findspark\n",
23+
"findspark.init()\n",
24+
"\n",
25+
"import pyspark\n",
26+
"from pyspark.sql import *\n",
27+
"from pyspark.sql.functions import *\n",
28+
"from pyspark.sql.types import *\n",
29+
"\n",
30+
"spark = SparkSession.builder\\\n",
31+
" .master(\"local[3]\")\\\n",
32+
" .appName(\"MiscDemo\")\\\n",
33+
" .getOrCreate()\n",
34+
"\n",
35+
"data_list = [(\"Ravi\",\"28\",\"1\",\"2002\"),\n",
36+
" (\"Abdul\",\"23\",\"5\",\"81\"), # 1981\n",
37+
" (\"John\",\"12\",\"12\",\"6\"), # 2006\n",
38+
" (\"Rosy\",\"7\",\"8\",\"63\"), # 1963\n",
39+
" (\"Abdul\",\"23\",\"5\",\"81\") # 1981\n",
40+
" ]\n",
41+
"raw_df = spark.createDataFrame(data_list).toDF(\"name\",\"day\",\"month\",\"year\").repartition(3)\n",
42+
"raw_df.printSchema()"
43+
]
44+
},
45+
{
46+
"cell_type":"code",
47+
"execution_count":7,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"name":"stdout",
52+
"output_type":"stream",
53+
"text": [
54+
"+-----+---+-----+----+-----------+\n",
55+
"| name|day|month|year| id|\n",
56+
"+-----+---+-----+----+-----------+\n",
57+
"|Abdul| 23| 5| 81| 0|\n",
58+
"| Ravi| 28| 1|2002| 8589934592|\n",
59+
"|Abdul| 23| 5| 81| 8589934593|\n",
60+
"| John| 12| 12| 6|17179869184|\n",
61+
"| Rosy| 7| 8| 63|17179869185|\n",
62+
"+-----+---+-----+----+-----------+\n",
63+
"\n"
64+
]
65+
}
66+
],
67+
"source": [
68+
"df1 = raw_df.withColumn(\"id\", monotonically_increasing_id())\n",
69+
"df1.show()"
70+
]
71+
},
72+
{
73+
"cell_type":"code",
74+
"execution_count":8,
75+
"metadata": {},
76+
"outputs": [
77+
{
78+
"name":"stdout",
79+
"output_type":"stream",
80+
"text": [
81+
"+-----+---+-----+------+-----------+\n",
82+
"| name|day|month| year| id|\n",
83+
"+-----+---+-----+------+-----------+\n",
84+
"|Abdul| 23| 5|1981.0| 0|\n",
85+
"| Ravi| 28| 1| 2002| 8589934592|\n",
86+
"|Abdul| 23| 5|1981.0| 8589934593|\n",
87+
"| John| 12| 12|2006.0|17179869184|\n",
88+
"| Rosy| 7| 8|1963.0|17179869185|\n",
89+
"+-----+---+-----+------+-----------+\n",
90+
"\n"
91+
]
92+
}
93+
],
94+
"source": [
95+
"df2 = df1.withColumn(\"year\", expr(\"\"\"\n",
96+
" case when year < 21 then year + 2000\n",
97+
" when year < 100 then year + 1900\n",
98+
" else year\n",
99+
" end\"\"\"))\n",
100+
"df2.show()"
101+
]
102+
},
103+
{
104+
"cell_type":"code",
105+
"execution_count":9,
106+
"metadata": {},
107+
"outputs": [
108+
{
109+
"name":"stdout",
110+
"output_type":"stream",
111+
"text": [
112+
"+-----+---+-----+----+-----------+\n",
113+
"| name|day|month|year| id|\n",
114+
"+-----+---+-----+----+-----------+\n",
115+
"|Abdul| 23| 5|1981| 0|\n",
116+
"| Ravi| 28| 1|2002| 8589934592|\n",
117+
"|Abdul| 23| 5|1981| 8589934593|\n",
118+
"| John| 12| 12|2006|17179869184|\n",
119+
"| Rosy| 7| 8|1963|17179869185|\n",
120+
"+-----+---+-----+----+-----------+\n",
121+
"\n"
122+
]
123+
}
124+
],
125+
"source": [
126+
"df3 = df1.withColumn(\"year\", expr(\"\"\"\n",
127+
" case when year < 21 then cast(year as int) + 2000\n",
128+
" when year < 100 then cast(year as int) + 1900\n",
129+
" else year\n",
130+
" end\"\"\"))\n",
131+
"df3.show()"
132+
]
133+
},
134+
{
135+
"cell_type":"code",
136+
"execution_count":11,
137+
"metadata": {},
138+
"outputs": [
139+
{
140+
"name":"stdout",
141+
"output_type":"stream",
142+
"text": [
143+
"+-----+---+-----+----+-----------+\n",
144+
"| name|day|month|year| id|\n",
145+
"+-----+---+-----+----+-----------+\n",
146+
"|Abdul| 23| 5|1981| 0|\n",
147+
"| Ravi| 28| 1|2002| 8589934592|\n",
148+
"|Abdul| 23| 5|1981| 8589934593|\n",
149+
"| John| 12| 12|2006|17179869184|\n",
150+
"| Rosy| 7| 8|1963|17179869185|\n",
151+
"+-----+---+-----+----+-----------+\n",
152+
"\n",
153+
"root\n",
154+
" |-- name: string (nullable = true)\n",
155+
" |-- day: string (nullable = true)\n",
156+
" |-- month: string (nullable = true)\n",
157+
" |-- year: integer (nullable = true)\n",
158+
" |-- id: long (nullable = false)\n",
159+
"\n"
160+
]
161+
}
162+
],
163+
"source": [
164+
"df4 = df1.withColumn(\"year\", expr(\"\"\"\n",
165+
" case when year < 21 then year + 2000\n",
166+
" when year < 100 then year + 1900\n",
167+
" else year\n",
168+
" end\"\"\").cast(IntegerType()))\n",
169+
"df4.show()\n",
170+
"df4.printSchema()"
171+
]
172+
},
173+
{
174+
"cell_type":"code",
175+
"execution_count":14,
176+
"metadata": {},
177+
"outputs": [
178+
{
179+
"name":"stdout",
180+
"output_type":"stream",
181+
"text": [
182+
"+-----+---+-----+----+-----------+\n",
183+
"| name|day|month|year| id|\n",
184+
"+-----+---+-----+----+-----------+\n",
185+
"|Abdul| 23| 5|1981| 0|\n",
186+
"| Ravi| 28| 1|2002| 8589934592|\n",
187+
"|Abdul| 23| 5|1981| 8589934593|\n",
188+
"| John| 12| 12|2006|17179869184|\n",
189+
"| Rosy| 7| 8|1963|17179869185|\n",
190+
"+-----+---+-----+----+-----------+\n",
191+
"\n"
192+
]
193+
}
194+
],
195+
"source": [
196+
"df5 = df1.withColumn(\"day\", col(\"day\").cast(IntegerType()))\\\n",
197+
" .withColumn(\"month\", col(\"month\").cast(IntegerType()))\\\n",
198+
" .withColumn(\"year\", col(\"year\").cast(IntegerType()))\n",
199+
"\n",
200+
"df6 = df5.withColumn(\"year\", expr(\"\"\"\n",
201+
" case when year < 21 then year + 2000\n",
202+
" when year < 100 then year + 1900\n",
203+
" else year\n",
204+
" end\"\"\"))\n",
205+
"df6.show()"
206+
]
207+
},
208+
{
209+
"cell_type":"code",
210+
"execution_count":15,
211+
"metadata": {},
212+
"outputs": [
213+
{
214+
"name":"stdout",
215+
"output_type":"stream",
216+
"text": [
217+
"+-----+---+-----+----+-----------+\n",
218+
"| name|day|month|year| id|\n",
219+
"+-----+---+-----+----+-----------+\n",
220+
"|Abdul| 23| 5|1981| 0|\n",
221+
"| Ravi| 28| 1|2002| 8589934592|\n",
222+
"|Abdul| 23| 5|1981| 8589934593|\n",
223+
"| John| 12| 12|2006|17179869184|\n",
224+
"| Rosy| 7| 8|1963|17179869185|\n",
225+
"+-----+---+-----+----+-----------+\n",
226+
"\n"
227+
]
228+
}
229+
],
230+
"source": [
231+
"df7 = df5.withColumn(\"year\",\\\n",
232+
" when(col(\"year\") < 21, col(\"year\") + 2000)\\\n",
233+
" .when(col(\"year\") < 100, col(\"year\") + 1900)\\\n",
234+
" .otherwise(col(\"year\")))\n",
235+
"df7.show()"
236+
]
237+
},
238+
{
239+
"cell_type":"code",
240+
"execution_count":16,
241+
"metadata": {},
242+
"outputs": [
243+
{
244+
"name":"stdout",
245+
"output_type":"stream",
246+
"text": [
247+
"+-----+---+-----+----+-----------+----------+\n",
248+
"| name|day|month|year| id| dob|\n",
249+
"+-----+---+-----+----+-----------+----------+\n",
250+
"|Abdul| 23| 5|1981| 0|1981-05-23|\n",
251+
"| Ravi| 28| 1|2002| 8589934592|2002-01-28|\n",
252+
"|Abdul| 23| 5|1981| 8589934593|1981-05-23|\n",
253+
"| John| 12| 12|2006|17179869184|2006-12-12|\n",
254+
"| Rosy| 7| 8|1963|17179869185|1963-08-07|\n",
255+
"+-----+---+-----+----+-----------+----------+\n",
256+
"\n"
257+
]
258+
}
259+
],
260+
"source": [
261+
"df8 = df7.withColumn(\"dob\", expr(\"to_date(concat(day,'/',month,'/',year), 'd/M/y')\"))\n",
262+
"df8.show()"
263+
]
264+
},
265+
{
266+
"cell_type":"code",
267+
"execution_count":20,
268+
"metadata": {},
269+
"outputs": [
270+
{
271+
"name":"stdout",
272+
"output_type":"stream",
273+
"text": [
274+
"+-----+-----------+----------+\n",
275+
"| name| id| dob|\n",
276+
"+-----+-----------+----------+\n",
277+
"| Rosy|17179869185|1963-08-07|\n",
278+
"|Abdul| 0|1981-05-23|\n",
279+
"| Ravi| 8589934592|2002-01-28|\n",
280+
"| John|17179869184|2006-12-12|\n",
281+
"+-----+-----------+----------+\n",
282+
"\n"
283+
]
284+
}
285+
],
286+
"source": [
287+
"df9 = df7.withColumn(\"dob\", to_date(expr(\"concat(day,'/',month,'/',year)\"), 'd/M/y'))\\\n",
288+
" .drop(\"day\",\"month\",\"year\")\\\n",
289+
" .dropDuplicates([\"name\",\"dob\"])\\\n",
290+
" .sort(expr(\"dob desc\"))\n",
291+
"df9.show()"
292+
]
293+
}
294+
],
295+
"metadata": {
296+
"kernelspec": {
297+
"display_name":"Python 3",
298+
"language":"python",
299+
"name":"python3"
300+
},
301+
"language_info": {
302+
"codemirror_mode": {
303+
"name":"ipython",
304+
"version":3
305+
},
306+
"file_extension":".py",
307+
"mimetype":"text/x-python",
308+
"name":"python",
309+
"nbconvert_exporter":"python",
310+
"pygments_lexer":"ipython3",
311+
"version":"3.7.6"
312+
}
313+
},
314+
"nbformat":4,
315+
"nbformat_minor":2
316+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp