Aggregating Data with Pair RDDs Chapter 12
201509
Course Chapters 1
Introduction
Course Introduction
2 Introduction to Hadoop and the Hadoop Ecosystem Hadoop Architecture and HDFS 3
Introduction to Hadoop
Importing Relational Data with Apache Sqoop 4 Introduction to Impala and Hive 5 6 Modeling and Managing Data with Impala and Hive Data Formats 7 Data File Partitioning 8
Importing and Modeling Structured Data
9
Capturing Data with Apache Flume
10 11 12 13 14 15 16 17
Spark Basics Working with RDDs in Spark Aggregating Data with Pair RDDs Writing and Deploying Spark Applications Parallel Processing in Spark Spark RDD Persistence Common Patterns in Spark Data Processing Spark SQL and DataFrames
18
Conclusion
Ingesting Streaming Data
Distributed Data Processing with Spark
Course Conclusion
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-2
Course Chapters 1
Introduction
Course Introduction
2 Introduction to Hadoop and the Hadoop Ecosystem Hadoop Architecture and HDFS 3
Introduction to Hadoop
Importing Relational Data with Apache Sqoop 4 Introduction to Impala and Hive 5 6 Modeling and Managing Data with Impala and Hive Data Formats 7 Data File Partitioning 8
Importing and Modeling Structured Data
9
Capturing Data with Apache Flume
10 11 12 13 14 15 16 17
Spark Basics Working with RDDs in Spark Aggregating Data with Pair RDDs Writing and Deploying Spark Applications Parallel Processing in Spark Spark RDD Persistence Common Patterns in Spark Data Processing Spark SQL and DataFrames
18
Conclusion
Ingesting Streaming Data
Distributed Data Processing with Spark
Course Conclusion
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-2
Aggregating Data with Pair RDDs In this chapter you will learn ▪
How to create Pair RDDs of key-value pairs from generic RDDs
▪
Special operations available on Pair RDDs
▪
How map-reduce algorithms are implemented in Spark
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-3
Chapter Topics Aggregating Data with Pair RDDs
Distributed Data Processing with Spark
▪
Key-Value Pair RDDs
▪
Map-Reduce
▪
Other Pair RDD Operations
▪
Conclusion
▪
Homework: Use Pair RDDs to Join Two Datasets
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-4
Pair RDDs ▪
▪
Pair RDDs are a special form of RDD – Each element must be a key-value pair (a two-element tuple) – Keys and values can be any type Why? – Use with map-reduce algorithms – Many additional functions are available for common data processing needs – e.g., sorting, joining, grouping, counting, etc.
Pair RDD (key1,value1) (key2,value2) (key3,value3) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-5
Creating Pair RDDs ▪
▪
The first step in most workflows is to get the data into key/value form – What should the RDD be keyed on? – What is the value? Commonly used functions to create Pair RDDs
– map – flatMap / flatMapValues – keyBy
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-6
Example: A Simple Pair RDD ▪
Example: Create a Pair RDD from a tab-separated file Python
> users = sc.textFile(file) \
.map(lambda line: line.split('\t')) \ .map(lambda fields: (fields[0],fields[1])) > val users = sc.textFile(file) \ Scala
.map(line => line.split('\t')) \ .map(fields => (fields(0),fields(1)))
user001\tFred Flintstone user090\tBugs Bunny user111\tHarry Potter …
(user001,Fred Flintstone) (user090,Bugs Bunny) (user111,Harry Potter) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-7
Example: Keying Web Logs by User ID
Python
Scala
> sc.textFile(logfile) \
.keyBy(lambda line: line.split(' ')[2]) > sc.textFile(logfile) \
.keyBy(line => line.split(' ')(2))
User ID 56.38.234.188 – 99788 "GET /KBDOC-00157.html HTTP/1.0" … 56.38.234.188 – 99788 "GET /theme.css HTTP/1.0" … 203.146.17.59 – 25254 "GET /KBDOC-00230.html HTTP/1.0" … …
(99788,56.38.234.188 – 99788 "GET /KBDOC-00157.html…) (99788,56.38.234.188 – 99788 "GET /theme.css…) (25254,203.146.17.59 – 25254 "GET /KBDOC-00230.html…) … © Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-8
Question 1: Pairs With Complex Values ▪
How would you do this? – Input: a list of postal codes with latitude and longitude – Output: postal code (key) and lat/long pair (value)
00210 00211 00212 00213 00214 V
43.005895 43.005895 43.005895 43.005895 43.005895
-71.013202 -71.013202 -71.013202 -71.013202 -71.013202
(00210,(43.005895,-71.013202)) (00211,(43.005895,-71.013202)) b
(00212,(43.005895,-71.013202)) (00213,(43.005895,-71.013202)) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-9
Answer 1: Pairs With Complex Values > sc.textFile( file) \
.map(lambda line: line.split()) \ .map(lambda fields: (fields[0],(fields[1],fields[2]))) > sc.textFile( file).
map(line => line.split('\t')). map(fields => (fields(0),(fields(1),fields(2)))) 00210 01014 01062 01263 V
43.005895 42.170731 42.324232 42.3929
-71.013202 -72.604842 -72.67915 -73.228483
(00210,(43.005895,-71.013202)) (01014,(42.170731,-72.604842)) (01062,(42.324232,-72.67915)) (01263,(42.3929,-73.228483)) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-10
Question 2: Mapping Single Rows to Multiple Pairs (1) ▪
How would you do this? – Input: order numbers with a list of SKUs in the order – Output: order (key) and sku (value) Pair RDD
Input Data
00001 00002 00003 00004
sku010:sku933:sku022 sku912:sku331 sku888:sku022:sku010:sku594 sku411
(00001,sku010) (00001,sku933) b
(00001,sku022) (00002,sku912) (00002,sku331) (00003,sku888) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-11
Question 2: Mapping Single Rows to Multiple Pairs (2) ▪
Hint: map alone won't work
00001 00002 00003 00004
sku010:sku933:sku022 sku912:sku331 sku888:sku022:sku010:sku594 sku411
(00001,(sku010,sku933,sku022)) (00002,(sku912,sku331)) (00003,(sku888,sku022,sku010,sku594)) (00004,(sku411))
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-12
Answer 2: Mapping Single Rows to Multiple Pairs (1) > sc.textFile(file)
00001
sku010:sku933:sku022
00002
sku912:sku331
00003
sku888:sku022:sku010:sku594
00004
sku411
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-13
Answer 2: Mapping Single Rows to Multiple Pairs (2) > sc.textFile(file) \
.map(lambda line: line.split('\t'))
00001
sku010:sku933:sku022
00002
sku912:sku331
00003 sku888:sku022:sku010:sku594 [00001,sku010:sku933:sku022] 00004 sku411 [00002,sku912:sku331] [00003,sku888:sku022:sku010:sku594] [00004,sku411]
Note that split returns 2-element arrays, not pairs/tuples
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-14
Answer 2: Mapping Single Rows to Multiple Pairs (3) > sc.textFile(file) \
.map(lambda line: line.split('\t')) \ .map(lambda fields: (fields[0],fields[1]))
00001
sku010:sku933:sku022
00002
sku912:sku331
00003 sku888:sku022:sku010:sku594 [00001,sku010:sku933:sku022] 00004 sku411 [00002,sku912:sku331] [00003,sku888:sku022:sku010:sku594] (00001,sku010:sku933:sku022) [00004,sku411] (00002,sku912:sku331) (00003,sku888:sku022:sku010:sku594) (00004,sku411)
Map array elements to tuples to produce a Pair RDD
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-15
Answer 2: Mapping Single Rows to Multiple Pairs (4) > sc.textFile(file) \
.map(lambda line: line.split('\t')) \ .map(lambda fields: (fields[0],fields[1])) .flatMapValues(lambda skus: skus.split(':')) 00001
sku010:sku933:sku022
00002
sku912:sku331
00003 sku888:sku022:sku010:sku594 [00001,sku010:sku933:sku022] 00004 sku411 [00002,sku912:sku331] [00003,sku888:sku022:sku010:sku594] (00001,sku010:sku933:sku022) [00004,sku411] (00002,sku912:sku331) (00003,sku888:sku022:sku010:sku594) (00004,sku411)
(00001,sku010) (00001,sku933) (00001,sku022) (00002,sku912) (00002,sku331) (00003,sku888) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-16
Chapter Topics Aggregating Data with Pair RDDs
Distributed Data Processing with Spark
▪
Key-Value Pair RDDs
▪
Map-Reduce
▪
Other Pair RDD Operations
▪
Conclusion
▪
Homework: Use Pair RDDs to Join Two Datasets
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-17
Map-Reduce ▪
▪
▪
Map-reduce is a common programming model – Easily applicable to distributed processing of large data sets Hadoop MapReduce is the major implementation – Somewhat limited – Each job has one Map phase, one Reduce phase – Job output is saved to files Spark implements map-reduce with much greater flexibility – Map and reduce functions can be interspersed – Results can be stored in memory – Operations can easily be chained
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-18
Map-Reduce in Spark ▪
▪
▪
Map-reduce in Spark works on Pair RDDs Map phase – Operates on one record at a time – “Maps” each record to one or more new records – e.g. map, flatMap, filter, keyBy Reduce phase – Works on map output – Consolidates multiple records – e.g. reduceByKey, sortByKey, mean
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-19
Map-Reduce Example: Word Count Result Input Data
the cat sat on the mat the aardvark sat on the sofa
b
aardvark
1
cat
1
mat
1
on
2
sat
2
sofa
1
the
4
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-20
Example: Word Count (1)
> counts = sc.textFile(file)
the cat sat on the mat the aardvark sat on the sofa
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-21
Example: Word Count (2)
> counts = sc.textFile(file) \
.flatMap(lambda line: line.split())
the cat sat on the mat the aardvark sat on the sofa
the cat sat on the mat the aardvark …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-22
Example: Word Count (3)
> counts = sc.textFile(file) \
.flatMap(lambda line: line.split()) \ .map(lambda word: ( word ,1))
the cat sat on the mat the aardvark sat on the sofa
Key-Value Pairs
the
(the, 1)
cat
(cat, 1)
sat
(sat, 1)
on
(on, 1)
the
(the, 1)
mat
(mat, 1)
the
(the, 1)
aardvark
(aardvark, 1)
…
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-23
Example: Word Count (4)
> counts = sc.textFile(file) \
.flatMap(lambda line: line.split()) \ .map(lambda word: (word,1)) \ .reduceByKey(lambda v1,v2: v1+v2)
the cat sat on the mat the aardvark sat on the sofa
the
(the, 1)
(aardvark, 1)
cat
(cat, 1)
(cat, 1)
sat
(sat, 1)
(mat, 1)
on
(on, 1)
(on, 2)
the
(the, 1)
(sat, 2)
mat
(mat, 1)
(sofa, 1)
the
(the, 1)
(the, 4)
aardvark
(aardvark, 1)
…
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-24
ReduceByKey (1) ▪
The function passed to reduceByKey combines values from two keys – Function must be binary
> counts = sc.textFile(file) \
.flatMap(lambda line: line.split()) \ .map(lambda word: (word,1)) \ .reduceByKey(lambda v1,v2: v1+v2)
(the,1) (cat,1)
(the,2)
(sat,1)
(on,2) (sofa,1)
(on,1)
(the,3)
(the,1) (mat,1)
(mat,1)
(the,4)
(the,1)
(aardvark,1) (the, 4)
(aardvark,1)
(cat,1)
(sat,1)
(sat,2)
(on,1) (the,1)
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-25
ReduceByKey (2) ▪
The function might be called in any order, therefore must be – Commutative – x+y = y+x – Associative – (x+y)+z = x+(y+z)
> counts = sc.textFile(file) \
.flatMap(lambda line: line.split()) \ .map(lambda word: (word,1)) \ .reduceByKey(lambda v1,v2: v1+v2)
(the,1) (cat,1)
(the,2)
(sat,1)
(on,2)
(on,1)
(sofa,1)
(the,1)
(mat,1)
(mat,1)
(the,4)
(the,1) (aardvark,1) (sat,1)
(the,2)
(aardvark,1) (the, 4) (cat,1) (sat,2)
(on,1) (the,1)
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-26
Word Count Recap (the Scala Version)
> val counts = sc.textFile(file).
flatMap(line => line.split("\\W")). map(word => (word ,1)). reduceByKey((v1,v2) => v1+v2) OR
> val counts = sc.textFile(file).
flatMap(_.split("\\W")). map(( _ ,1)). reduceByKey( _+_ )
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-27
Why Do We Care About Counting Words? ▪
▪
▪
▪
Word count is challenging over massive amounts of data – Using a single compute node would be too time-consuming – Number of unique words could exceed available memory Statistics are often simple aggregate functions – Distributive in nature – e.g., max, min, sum, count Map-reduce breaks complex tasks down into smaller elements which can be executed in parallel Many common tasks are very similar to word count – e.g., log file analysis
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-28
Chapter Topics Aggregating Data with Pair RDDs
Distributed Data Processing with Spark
▪
Key-Value Pair RDDs
▪
Map-Reduce
▪
Other Pair RDD Operations
▪
Conclusion
▪
Homework: Use Pair RDDs to Join Two Datasets
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-29
Pair RDD Operations ▪
▪
In addition to map and reduce functions, Spark has several operations specific to Pair RDDs Examples
– countByKey – return a map with the count of occurrences of each key
– groupByKey – group all the values for each key in an RDD – sortByKey – sort in ascending or descending order – join – return an RDD containing all pairs with matching keys from two RDDs
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-30
Example: Pair RDD Operations
(00004,sku411) (00003,sku888)
(00001,sku010)
sortByKey(ascending=False) (00001,sku022) (00001,sku933) (00002,sku912) (00002,sku331) (00003,sku888)
(00003,sku022) (00003,sku010) (00003,sku594) (00002,sku912) …
… (00002,[sku912,sku331]) (00001,[sku022,sku010,sku933]) (00003,[sku888,sku022,sku010,sku594]) (00004,[sku411])
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-31
Example: Joining by Key > movies = moviegross.join(movieyear) RDD: moviegross
RDD: movieyear
(Casablanca,$3.7M )
(Casablanca,1942)
(Star Wars,$775M )
(Star Wars,1977)
(Annie Hall,$38M )
(Annie Hall,1977)
(Argo,$232M )
(Argo,2012)
…
…
(Casablanca,($3.7M,1942)) (Star Wars,($775M,1977)) (Annie Hall,($38M,1977)) (Argo,($232M,2012)) … © Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-32
Using Join ▪
A common programming pattern 1. Map separate datasets into key-value Pair RDDs 2. Join by key 3. Map joined data into the desired format 4. Save, display, or continue processing…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-33
Example: Join Web Log With Knowledge Base Articles (1)
weblogs 56.38.234.188 56.38.234.188 203.146.17.59 221.78.60.155 65.187.255.81 …
– – – – –
99788 99788 25254 45402 14242
"GET "GET "GET "GET "GET
/KBDOC-00157.html HTTP/1.0" … /theme.css HTTP/1.0" … /KBDOC-00230.html HTTP/1.0" … /titanic_4000_sales.html HTTP/1.0" … /KBDOC-00107.html HTTP/1.0" … Requested File
User ID join
kblist KBDOC-00157:Ronin Novelty Note 3 - Back up files KBDOC-00230:Sorrento F33L - Transfer Contacts KBDOC-00050:Titanic 1000 - Transfer Contacts KBDOC-00107:MeeToo 5.0 - Transfer Contacts KBDOC-00300:iFruit 5A – overheats … Article ID
Article Title
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-34
Example: Join Web Log With Knowledge Base Articles (2) ▪
Steps 1. Map separate datasets into key-value Pair RDDs a. Map web log requests to (docid,userid) b. Map KB Doc index to (docid,title) 2. Join by key: docid 3. Map joined data into the desired format: (userid,title) 4. Further processing: group titles by User ID
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-35
Step 1a: Map Web Log Requests to (docid,userid) > import re > def getRequestDoc(s):
return re.search(r'KBDOC-[0-9]*',s).group() > kbreqs = sc.textFile( logfile) \
.filter(lambda line: 'KBDOC-' in line) \ .map(lambda line: (getRequestDoc(line),line.split(' ')[2])) \ .distinct() 56.38.234.188 56.38.234.188 203.146.17.59 221.78.60.155 65.187.255.81 …
– – – – –
99788 99788 25254 45402 14242
"GET "GET "GET "GET "GET
/KBDOC-00157.html HTTP/1.0" … /theme.css HTTP/1.0" … /KBDOC-00230.html HTTP/1.0" … /titanic_4000_sales.html HTTP/1.0" kbreqs … /KBDOC-00107.html HTTP/1.0" …
(KBDOC-00157,99788) (KBDOC-00230,25254) (KBDOC-00107,14242) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-36
Step 1b: Map KB Index to (docid,title) > kblist = sc.textFile(kblistfile) \
.map(lambda line: line.split(':')) \ .map(lambda fields: (fields[0],fields[1])) KBDOC-00157:Ronin Novelty Note 3 - Back up files KBDOC-00230:Sorrento F33L - Transfer Contacts KBDOC-00050:Titanic 1000 - Transfer Contacts KBDOC-00107:MeeToo 5.0 - Transfer Contacts KBDOC-00206:iFruit 5A – overheats …
kblist
(KBDOC-00157,Ronin Novelty Note 3 - Back up files ) (KBDOC-00230,Sorrento F33L - Transfer Contacts ) (KBDOC-00050,Titanic 1000 - Transfer Contacts ) (KBDOC-00107, MeeToo 5.0 - Transfer Contacts) … © Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-37
Step 2: Join By Key docid
> titlereqs = kbreqs.join(kblist)
kbreqs
kblist
(KBDOC-00157,99788)
(KBDOC-00157,Ronin Novelty Note 3 - Back up files)
(KBDOC-00230,25254)
(KBDOC-00230,Sorrento F33L - Transfer Contacts)
(KBDOC-00107,14242)
(KBDOC-00050,Titanic 1000 - Transfer Contacts)
…
(KBDOC-00107, MeeToo 5.0 - Transfer Contacts) …
(KBDOC-00157,(99788,Ronin Novelty Note 3 - Back up files)) (KBDOC-00230,(25254,Sorrento F33L - Transfer Contacts)) (KBDOC-00107,(14242,MeeToo 5.0 - Transfer Contacts)) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-38
Step 3: Map Result to Desired Format (userid,title)
> titlereqs = kbreqs.join(kblist) \
.map(lambda (docid ,(userid,title)): (userid ,title))
(KBDOC-00157,(99788,Ronin Novelty Note 3 - Back up files)) (KBDOC-00230,(25254,Sorrento F33L - Transfer Contacts)) (KBDOC-00107,(14242,MeeToo 5.0 - Transfer Contacts)) …
(99788,Ronin Novelty Note 3 - Back up files) (25254,Sorrento F33L - Transfer Contacts) (14242,MeeToo 5.0 - Transfer Contacts) …
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-39
Step 4: Continue Processing – Group Titles by User ID
> titlereqs = kbreqs.join(kblist) \
.map(lambda (docid,(userid,title)): (userid,title)) \ .groupByKey() (99788,Ronin Novelty Note 3 - Back up files) (25254,Sorrento F33L - Transfer Contacts) (14242, MeeToo 5.0 - Transfer Contacts) …
(99788,[Ronin Novelty Note 3 - Back up files, Ronin S3 - overheating]) (25254,[Sorrento F33L - Transfer Contacts])
Note: values are grouped into Iterables
(14242,[MeeToo 5.0 - Transfer Contacts, MeeToo 5.1 - Back up files,
iFruit 1 - Back up files, MeeToo 3.1 - Transfer Contacts])
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-40
Example Output
> for (userid,titles) in titlereqs.take(10):
    print 'user id: ',userid
    for title in titles: print '\t',title
(99788,[Ronin Novelty Note 3 - Back up files, Ronin S3 - overheating]) (25254,[Sorrento F33L - Transfer Contacts]) (14242,[MeeToo 5.0 - Transfer Contacts, MeeToo 5.1 - Back up files, iFruit 1 - Back up files, MeeToo 3.1 - Transfer Contacts]) …
user id: 99788
    Ronin Novelty Note 3 - Back up files
    Ronin S3 - overheating
user id: 25254
    Sorrento F33L - Transfer Contacts
user id: 14242
    MeeToo 5.0 - Transfer Contacts
    MeeToo 5.1 - Back up files
    iFruit 1 - Back up files
    MeeToo 3.1 - Transfer Contacts
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-41
Aside: Anonymous Function Parameters ▪
Python and Scala pattern matching can help improve code readability > map(lambda (docid,(userid,title)): (userid,title))
Python (Python 2 only: tuple-unpacking lambda parameters were removed in Python 3)
Scala
> map(pair => (pair._2._1, pair._2._2))
OR > map{case (docid,(userid,title)) => (userid,title)}
(KBDOC-00157,(99788,…title…))
(99788,…title…)
(KBDOC-00230,(25254,…title…))
(25254,…title…)
(KBDOC-00107,(14242,…title…))
(14242,…title…)
…
…
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-42
Other Pair Operations ▪
▪
Some other pair operations – keys – return an RDD of just the keys, without the values – values – return an RDD of just the values, without keys – lookup(key) – return the value(s) for a key – leftOuterJoin, rightOuterJoin, fullOuterJoin – join, including keys defined in the left, right or either RDD respectively – mapValues, flatMapValues – execute a function on just the values, keeping the key the same See the PairRDDFunctions class Scaladoc for a full list
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-43
Chapter Topics Aggregating Data with Pair RDDs
Distributed Data Processing with Spark
▪
Key-Value Pair RDDs
▪
Map-Reduce
▪
Other Pair RDD Operations
▪
Conclusion
▪
Homework: Use Pair RDDs to Join Two Datasets
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-44
Essential Points ▪
Pair RDDs are a special form of RDD consisting of Key-Value pairs (tuples)
▪
Spark provides several operations for working with Pair RDDs
▪
Map-reduce is a generic programming model for distributed processing – Spark implements map-reduce with Pair RDDs – Hadoop MapReduce and other implementations are limited to a single map and single reduce phase per job – Spark allows flexible chaining of map and reduce operations – Spark provides operations to easily perform common map-reduce algorithms like joining, sorting, and grouping
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-45
Chapter Topics Aggregating Data with Pair RDDs
Distributed Data Processing with Spark
▪
Key-Value Pair RDDs
▪
Map-Reduce
▪
Other Pair RDD Operations
▪
Conclusion
▪
Homework: Use Pair RDDs to Join Two Datasets
© Copyright 2010-2015 Cloudera. All rights reserved. Not to be reproduced or shared without prior written consent from Cloudera. 12-46