Spark SQL and DataFrames Chapter 17
201509
1:@$0/ 12#",/$0 3
J),$:*@FK:)
1:@$0/ J),$:*@FK:)
5 J),$:*@FK:) ,: L#*::" #)* ,2/ L#*::" MF:0;0,/. L#*::" B$F2<,/F,@$/ #)* L+-! U
J),$:*@FK:) ,: L#*::"
J.":$K)= O/?#K:)#? +#,# G<,2 B"#F2/ !P::" V J),$:*@FK:) ,: J."#?# #)* L
J.":$K)= #)* N:*/?<)= !,$@F,@$/* +#,#
8
1#",@$<)= +#,# G<,2 B"#F2/ -?@./
36 33 35 3U 3V 37 3W !"
!"#$% R#0
3T
1:)F?@0<:)
J)=/0K)= !,$/#.<)= +#,#
&'()*'+,)-. &/)/ 0*12-(('34 5')6 78/*9
1:@$0/ 1:)F?@0<:)
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
1:@$0/ 12#",/$0 3
J),$:*@FK:)
1:@$0/ J),$:*@FK:)
5 J),$:*@FK:) ,: L#*::" #)* ,2/ L#*::" MF:0;0,/. L#*::" B$F2<,/F,@$/ #)* L+-! U
J),$:*@FK:) ,: L#*::"
J.":$K)= O/?#K:)#? +#,# G<,2 B"#F2/ !P::" V J),$:*@FK:) ,: J."#?# #)* L
J.":$K)= #)* N:*/?<)= !,$@F,@$/* +#,#
8
1#",@$<)= +#,# G<,2 B"#F2/ -?@./
36 33 35 3U 3V 37 3W !"
!"#$% R#0
3T
1:)F?@0<:)
J)=/0K)= !,$/#.<)= +#,#
&'()*'+,)-. &/)/ 0*12-(('34 5')6 78/*9
1:@$0/ 1:)F?@0<:)
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrames and Spark SQL
In this chapter you will learn
• What Spark SQL is
• What features the DataFrame API provides
• How to create a SQLContext
• How to load existing data into a DataFrame
• How to query data in a DataFrame
• How to convert from DataFrames to Pair RDDs
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
What is Spark SQL?
• What is Spark SQL?
  – Spark module for structured data processing
  – Replaces Shark (a prior Spark module, now deprecated)
  – Built on top of core Spark
• What does Spark SQL provide?
  – The DataFrame API - a l…
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
SQL Context
• The main Spark SQL entry point is a SQL Context object
  – Requires a SparkContext
  – The SQL Context in Spark SQL is similar to Spark Context in core Spark
• There are two implementations
  – SQLContext – basic…
  – HiveContext – Reads and writes Hive…
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Creating a SQL Context
• SQLContext is created based on the SparkContext
Python
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
Scala
import org.apache.spark.sql.SQLContext
val sqlCtx = new SQLContext(sc)
import sqlCtx._
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrames
• DataFrames are the main abstraction in Spark SQL
  – Analogous to RDDs in core Spark
  – A distributed…
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Creating DataFrames
• DataFrames can be created
  – From an existing structured data source (Parquet file, JSON file, etc.)
  – From an existing RDD
  – By performing an operation or query on another DataFrame
  – By programmatically defining a schema
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Example: Creating a DataFrame from a JSON File
Python
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
peopleDF = sqlCtx.jsonFile("people.json")
Scala
val sqlCtx = new SQLContext(sc)
import sqlCtx._
val peopleDF = sqlCtx.jsonFile("people.json")
File: people.json
{"name":"Alice", "pcode":"94304"} {"name":"Brayden", "age":30, "pcode":"94304"} {"name":"Carla", "age":19, "pcode":"10036"} {"name":"Diana", "age":46} {"name":"Étienne", "pcode":"94104"}
age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Creating a DataFrame from a Data Source
• Methods on the SQLContext object
• Convenience functions
  – jsonFile(filename)
  – parquetFile(filename)
• Generic base function: load
  – load(filename,source) - load filename of type source (default Parquet)
  – load(source,options…) - load from a source of type source using options
  – Convenience functions are implemented by calling load
    – jsonFile("people.json") = load("people.json", "json")
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Data Sources
• Spark SQL 1.3 includes three data source types
  – json
  – parquet
  – jdbc
• You can also use third party data source libraries, such as
  – Avro
  – HBase
  – CSV
  – MySQL
  – and more being added all the time
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
j/)/$
Example: Loading from a MySQL database
val accountsDF = sqlCtx.load("jdbc", Map("url"-> "jdbc:mysql://dbhost/dbname?user=…&password=…", "dbtable" -> "accounts"))
accountsDF = sqlCtx.load(source="jdbc", \
  url="jdbc:mysql://dbhost/dbname?user=…&password=…", \
  dbtable="accounts")
Warning: Avoid direct access to databases in production environments, wh…
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
j/)/$"#$,; :$ 1@0,:. !:@$F/0 !
!
You can also use custom or third party data sources
Example: Read from an Avro file using the avro source in the Databricks Spark Avro package
$ spark-shell --packages com.databricks:spark-avro_2.10:1.0.0 > … > val myDF = sqlCtx.load("myfile.avro","com.databricks.spark.avro")
$ pyspark --packages com.databricks:spark-avro_2.10:1.0.0 > … > myDF = sqlCtx.load("myfile.avro","com.databricks.spark.avro")
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Basic Operations
Basic Operations deal with DataFrame metadata (rather than its data), e.g.
schema ^ $/,@$)0 # !F2/.# :Ef/F, */0F$)#./Y,;"/] "#<$0 – explain ^ "$<),0 */E@= <)I:$.#K:) #E:@, ,2/ +#,#-$#./ ,: ,2/ –
F:)0:?/
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Basic Operations
Example: Displaying column data types using dtypes
> peopleDF = sqlCtx.jsonFile("people.json")
> for item in peopleDF.dtypes: print item
('age', 'bigint')
('name', 'string')
('pcode', 'string')
> val peopleDF = sqlCtx.jsonFile("people.json")
> peopleDF.dtypes.foreach(println)
(age,LongType)
(name,StringType)
(pcode,StringType)
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Working with Data in a DataFrame
• Queries – create a new DataFrame
  – DataFrames are immutable
  – Queries are analogous to RDD transformations
• Actions – return data to the Driver
  – Actions trigger “lazy” execution of queries
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Actions
• Some DataFrame actions
  – collect - return all rows as an array of Row objects
  – take(n) - return the first n rows as an array of Row objects
  – count - return the number of rows
  – show(n) - display the first n rows (default=20)
>
peopleDF.count() 5L
>
peopleDF.show(3) age name pcode null Alice 94304 30 Brayden 94304 19 Carla 10036
>
>
peopleDF.count() res7: Long = 5 peopleDF.show(3) age name pcode null Alice 94304 30 Brayden 94304 19 Carla 10036
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Queries (1)
• DataFrame query methods return new DataFrames
  – Queries can be chained like transformations
• Some query methods
  – distinct - returns a new DataFrame with distinct elements of this DF
  – join - joins this DataFrame with a second DataFrame
    – several variants for inside, outside, left, right, etc.
  – limit - a new DF with the first n rows of this DataFrame
  – select - a new DataFrame with data from one or more columns of the base DataFrame
  – filter - a new DataFrame with rows meeting a specified condition
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Queries (2)
• Example: A basic query with limit
>
peopleDF.limit(3).show
>
peopleDF.limit(3).show()
Output of show
age null 30 19
name Alice Brayden Carla
pcode 94304 94304 10036
age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
age null 30 19
name Alice Brayden Carla
pcode 94304 94304 10036
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Query Strings (1)
• Some query operations take strings containing simple query expressions
  – Such
!
age null
as select and where
30
Example: select
age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
19 46
peopleDF.select("age")
pcode 94304 94304 10036 null 94104
peopleDF.select("name","age")
null name
age
Alice
null
Brayden
30
Carla
19
Diana
46
Étienne
null
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
DataFrame Query Strings (2)
• Example: where
peopleDF. where("age > 21") age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
age 30 46
name pcode Brayden 94304 Diana null
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Querying DataFrames using Columns (1)
• Some DF queries take one or more columns or column expressions
  – Required for more sophisticated operations
• Some examples
  – select
  – sort
  – join
  – where
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Querying DataFrames using Columns (2)
• In Python, reference columns by name using dot notation
ageDF = peopleDF.select(peopleDF.age)
• In Scala, columns can be referenced in two ways
val ageDF = peopleDF.select($"age")
age
name
pcode
null Alice
94304
30
Brayden
94304
19
Carla
10036
46
Diana
null
null Étienne
94104
– !"
val ageDF = peopleDF.select(peopleDF("age"))
age null 30 19 46 null
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Querying DataFrames using Columns (3)
• Column references can also be column expressions
peopleDF.select(peopleDF.name,peopleDF.age+10)
peopleDF.select(peopleDF("name"),peopleDF("age")+10)
age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
name Alice Brayden Carla Diana Étienne
age+10 null 40 29 56 null
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Querying DataFrames using Columns (4)
• Example: Sorting in by columns (descending)
.asc
peopleDF.sort(peopleDF.age.desc())
and .desc are column expression methods used with
sort peopleDF.sort(peopleDF("age").desc) age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
age 46 30 19 null null
name Diana Brayden Carla Alice Étienne
pcode null 94304 10036 94304 94104
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
SQL Queries
• Spark SQL also supports the ability to perform SQL queries
  – First, register the DataFrame as a “table” with the SQL Context
peopleDF.registerTempTable("people") sqlCtx.sql("""SELECT * FROM people WHERE name LIKE "A%" """)
peopleDF.registerTempTable("people") sqlCtx.sql("""SELECT * FROM people WHERE name LIKE "A%" """) age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
pcode 94304 94304 10036 null 94104
age null
name Alice
pcode 94304
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$T
Saving DataFrames
• Data in DataFrames can be saved to a data source
  – Built in support for JDBC and Parquet File
    – createJDBCTable - create a new table in a database
    – insertInto - save to an existing table in a database
    – saveAsParquetFile - save as a Parquet file (including schema)
    – saveAsTable - save as a Hive…
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$!
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$$
DataFrames and RDDs (1)
• DataFrames are built on RDDs
  – Base RDDs contain Row objects
  – Use rdd to get the underlying RDD
peopleRDD = peopleDF.rdd
peopleDF age null 30 19 46 null
name Alice Brayden Carla Diana Étienne
peopleRDD pcode 94304 94304 10036 null 94104
Row[null,Alice,94304] Row[30,Brayden,94304] Row[19,Carla,10036] Row[46,Diana,null] Row[null,Étienne,94104]
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$>
DataFrames and RDDs (2)
• Row RDDs have all the standard Spark actions and transformations
  – Actions - collect, take, count, etc.
  – Transformations - map, flatMap, filter, etc.
• Row RDDs can be transformed into PairRDDs to use map-reduce methods
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$L
Working with Row Objects
• The syntax for extracting data from Rows depends on language
• Python
  – Column names are object attributes
• Scala
  – Use Array-like syntax
    – row(0) - returns element in the first column
    – row(1) - return element in the second column
    – etc.
  – Use type-specific get methods to return typed values
    – row.getString(n) - returns nth column as a String
    – row.getInt(n) - returns nth column as an Integer
    – etc.
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$M
Example: Extracting Data from Rows
• Extract data from Rows
Row[null,Alice,94304] Row[30,Brayden,94304] Row[19,Carla,10036]
peopleRDD = peopleDF.rdd
peopleByPCode = peopleRDD \
  .map(lambda row: (row.pcode,row.name)) \
  .groupByKey()
Row[46,Diana,null] Row[null,Étienne,94104] (94304, Alice) (94304,Brayden) (10036,Carla)
val peopleRDD = peopleDF.rdd
val peopleByPCode = peopleRDD.
  map(row => (row(2),row(1))).
  groupByKey()
(null,Diana) (94104,Étienne) (null,[Diana]) (94304,[Alice,Brayden]) (10036,[Carla]) (94104,[Étienne])
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$O
Converting RDDs to DataFrames
• You can also create a DF from an RDD
  – sqlCtx.createDataFrame(rdd)
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$"
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$R
Comparing Impala to Spark SQL
• Spark SQL is built on Spark, a general purpose processing engine
  – Provides convenient SQL-like access to structured data in a Spark application
• Impala is a specialized SQL engine
  – Much better performance for querying
  – Much more mature than Spark SQL
  – Robust security via Sentry
• Impala is better for
  – Interactive queries
  – Data analysis
• Use Spark SQL for
  – ETL
  – Access to structured data required by a Spark application
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$S
Comparing Spark SQL with Hive-on-Spark
• Spark SQL
  – Provides the DataFrame API to allow structured data processing in a Spark application
  – Programmers can m…
• Hive-on-Spark
  – Hive provides a SQL abstraction layer over MapReduce or Spark
    – Allows non-programmers to analyze data using familiar SQL
  – Hive-on-Spark replaces MapReduce as the engine underlying Hive
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Essential Points
• Spark SQL is a Spark API for handling structured and semi-structured data
• Entry point is a SQLContext
• DataFrames are the key unit of data
• DataFrames are based on an underlying RDD of Row objects
• DataFrames query methods return new DataFrames; similar to RDD transformations
• The full Spark API can be used with Spark SQL Data by accessing the underlying RDD
• Spark SQL is not a replacement for a database, or a specialized SQL engine like Impala
  – Spark SQL is most useful for ETL or incorporating structured data into other applications
5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$
Chapter Topics
Distributed Data Processing with Spark
• Spark SQL and the SQL Context
• Creating DataFrames
• Transforming and Querying DataFrames
• Saving DataFrames
• DataFrames and RDDs
• Comparing Spark SQL, Impala and Hive-on-Spark
• Conclusion
• Homework: Use Spark SQL for ETL
9 1:";$<=2, 5636>5637 1?:@*/$#A B?? $<=2,0 $/0/$C/*A D:, ,: E/ $/"$:*@F/* :$ 02#$/* G<,2:@, "$<:$ G$