Working With RDDs in Spark Chapter 11
201509
Course Chapters 1
Introd[uction …]
Course Introd[uction …]
2 Introd[uction …]
Introd[uction …]
Importing Relational Data with Apache Sqoop 4 Introd[uction …]
Importing and Modeling Str[uctured …]
9
Capturing Data with Apache Fl[ume …]
10 11 12 13 14 15 16 17
Spark Basics · Working with RDDs in Spark · Aggregating Data with Pair RDDs · Writing and Deploying Spark Applications · Parallel Processing in Spark · Spark RDD Persistence · Common Patterns in Spark Data Processing · Spark SQL and DataFrames
18
Conclusion
Ingesting Streaming Data
Distributed Data Processing with Spark
Course Conclusion
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-2
Working With RDDs
In this chapter you will learn
• How RDDs are created from files or data in memory
• How to handle file formats with multi-line records
• How to use some additional operations on RDDs
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-3
Chapter Topics
Working With RDDs in Spark
Distributed Data Processing with Spark
• Creating RDDs
• Other General RDD Operations
• Conclusion
• Homework: Process Data Files with Spark
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-4
RDDs
• RDDs can hold any type of element
  – Primitive types: integers, characters, booleans, etc.
  – Sequence types: strings, lists, arrays, tuples, dicts, etc. (including nested data types)
  – Scala/Java Objects (if serializable)
  – Mixed types
• Some types of RDDs have additional functionality
  – Pair RDDs
    – RDDs consisting of Key-Value pairs
  – Do[uble RDDs …]
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-5
Creating RDDs From Collections
• You can create RDDs from collections instead of files
  – sc.parallelize(collection)

> myData = ["Alice","Carlos","Frank","Barbara"]
> myRdd = sc.parallelize(myData)
> myRdd.take(2)
['Alice', 'Carlos']

• Useful when
  – Testing
  – Generating data programmatically
  – Integrating
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-6
Creating RDDs from Files (1)
• For file-based RDDs, use SparkContext.textFile
  – Accepts a single file, a wildcard list of files, or a comma-separated list of files
  – Examples
    – sc.textFile("myfile.txt")
    – sc.textFile("mydata/*.log")
    – sc.textFile("myfile1.txt,myfile2.txt")
  – Each line in the file(s) is a separate record in the RDD
• Files are referenced by absolute or relative URI
  – Absolute URI:
    – file:/home/training/myfile.txt
    – hdfs://localhost/loudacre/myfile.txt
  – Relative URI (uses default file system):
    – myfile.txt
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-7
Creating RDDs from Files (2)
• textFile maps each line in a file to a separate RDD element
I've never seen a purple cow.\n I never hope to see one;\n But I can tell you, anyhow,\n I'd rather see than be one.\n
I've never seen a purple cow. I never hope to see one; But I can tell you, anyhow, I'd rather see than be one.
• textFile only works with line-delimited text files
• What about other formats?
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-8
Input and Output Formats (1)
• Spark uses Hadoop InputFormat and OutputFormat Java classes
  – Some examples from core Hadoop
    – TextInputFormat / TextOutputFormat – newline delimited text files
    – SequenceInputFormat / SequenceOutputFormat
    – FixedLengthInputFormat
  – Many implementations available in additional libraries
    – e.g. AvroInputFormat / AvroOutputFormat in the Avro library
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-9
Input and Output Formats (2)
• Specify any input format using sc.hadoopFile
  – or newAPIHadoopFile for New API classes
• Specify any output format using rdd.saveAsHadoopFile
  – or saveAsNewAPIHadoopFile for New API classes
• textFile and saveAsTextFile are convenience functions
  – textFile just calls hadoopFile specifying TextInputFormat
  – saveAsTextFile calls saveAsHadoopFile specifying TextOutputFormat
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-10
Whole File-Based RDDs (1)
• sc.textFile maps each line in a file to a separate RDD element
  – What about files with a multi-line input format, e.g. XML or JSON?
• sc.wholeTextFiles(directory)
  – Maps entire contents of each file in a directory to a single RDD element
  – Works only for small files (element must fit in memory)

file1.json
{ "firstName":"Fred", "lastName":"Flintstone", "userid":"123" }
file2.json
{ "firstName":"Barney", "lastName":"Rubble", "userid":"234" }

(file1.json,{"firstName":"Fred","lastName":"Flintstone","userid":"123"} )
(file2.json,{"firstName":"Barney","lastName":"Rubble","userid":"234"} )
(file3.xml,… )
(file4.xml,… )
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-11
Whole File-Based RDDs (2)

> import json
> myrdd1 = sc.wholeTextFiles(mydir)
> myrdd2 = myrdd1
    .map(lambda (fname,s): json.loads(s))
> for record in myrdd2.take(2):
    print record["firstName"]

Output:
Fred
Barney

> import scala.util.parsing.json.JSON
> val myrdd1 = sc.wholeTextFiles(mydir)
> val myrdd2 = myrdd1
    .map(pair => JSON.parseFull(pair._2).get.
      asInstanceOf[Map[String,String]])
> for (record <- myrdd2.take(2))
    println(record.getOrElse("firstName",null))
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-12
Chapter Topics
Working With RDDs in Spark
Distributed Data Processing with Spark
• Creating RDDs
• Other General RDD Operations
• Conclusion
• Homework: Process Data Files with Spark
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-13
Some Other General RDD Operations
• Single-RDD Transformations
  – flatMap – maps one element in the base RDD to multiple elements
  – distinct – filter out duplicates
  – sortBy – use provided function to sort
• Multi-RDD Transformations
  – intersection – create a new RDD with all elements in both original RDDs
  – union – add all elements of two RDDs into a single new RDD
  – zip – pair each element of the first RDD with the corresponding element of the second
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-14
Example: flatMap and distinct
Python
> sc.textFile(file) \
    .flatMap(lambda line: line.split()) \
    .distinct()
Scala
> sc.textFile(file).
    flatMap(line => line.split(' ')).
    distinct()
I've never seen a purple cow. I never hope to see one; But I can tell you, anyhow, I'd rather see than be one.
I’ve
I’ve
never
never
seen
seen
a
a
purple
purple
cow
cow
I
I
never
hope
hope
to
to
…
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-15
Examples: Multi-RDD Transformations
rdd1
rdd2
Chicago
San Francisco
Boston
Boston
Paris
Amsterdam
San Francisco
Mumbai
Tokyo
McMurdo Station
rdd1.subtract(rdd2)
rdd1.zip(rdd2)
rdd1.union(rdd2)
Chicago Boston Paris San Francisco Tokyo San Francisco Boston
Tokyo
(Chicago,San Francisco)
Amsterdam
Paris
(Boston,Boston)
Mumbai
Chicago
(Paris,Amsterdam)
McMurdo Station
(San Francisco,Mumbai) (Tokyo,McMurdo Station)
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-16
Some Other General RDD Operations
• Other RDD operations
  – first – return the first element of the RDD
  – foreach – apply a function to each element in an RDD
  – top(n) – return the largest n elements using natural ordering
• Sampling operations
  – sample – create a new RDD with a sampling of elements
  – takeSample – return an array of sampled elements
• Double RDD operations
  – Statistical functions, e.g., mean, sum, variance, stdev
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-17
Chapter Topics
Working With RDDs in Spark
Distributed Data Processing with Spark
• Creating RDDs
• Other General RDD Operations
• Conclusion
• Homework: Process Data Files with Spark
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-18
Essential Points
• RDDs can be created from files, parallelized data in memory, or other RDDs
• sc.textFile reads newline delimited text, one line per RDD record
• sc.wholeTextFiles reads entire files into single RDD records
• Generic RDDs can consist of any type of data
• Generic RDDs provide a wide range of transformation operations
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-19
Chapter Topics
Working With RDDs in Spark
Distributed Data Processing with Spark
• Creating RDDs
• Other General RDD Operations
• Conclusion
• Homework: Process Data Files with Spark
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-20
Homework: Process Data Files with Spark
• In this homework assignment you will
  – Process a set of XML files using wholeTextFiles
  – Reformat a dataset to standardize format (bonus)
• Please refer to the Homework description
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reprod[uced …] 11-21