207 lines
6.4 KiB
Plaintext
207 lines
6.4 KiB
Plaintext
|
### Task 1 ###
|
||
|
|
||
|
hive> select count(*) from flights where month = 4;
|
||
|
|
||
|
stage-stage-1: map: 4 reduce: 1 cumulative cpu: 21.83 sec hdfs read: 534149548 hdfs write: 7 success
|
||
|
total mapreduce cpu time spent: 21 seconds 830 msec
|
||
|
ok
|
||
|
436007
|
||
|
time taken: 27.307 seconds, fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights where month = 11 and dayofmonth =6;
|
||
|
|
||
|
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 21.41 sec HDFS Read: 534150190 HDFS Write: 6 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 21 seconds 410 msec
|
||
|
OK
|
||
|
15092
|
||
|
Time taken: 27.366 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights where month = 8 and dayofmonth > 9 and dayofmonth < 21;
|
||
|
|
||
|
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.9 sec HDFS Read: 534151190 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 18 seconds 900 msec
|
||
|
OK
|
||
|
163800
|
||
|
Time taken: 26.045 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
### Task 2 ###
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month where month = 4;
|
||
|
|
||
|
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 5.89 sec HDFS Read: 40967391 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 5 seconds 890 msec
|
||
|
OK
|
||
|
436007
|
||
|
Time taken: 21.732 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month where month = 11 and dayofmonth =6;
|
||
|
|
||
|
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.77 sec HDFS Read: 40919465 HDFS Write: 6 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 4 seconds 770 msec
|
||
|
OK
|
||
|
15092
|
||
|
Time taken: 22.742 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month where month = 8 and dayofmonth > 9 and dayofmonth < 21;
|
||
|
|
||
|
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.96 sec HDFS Read: 43445074 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 4 seconds 960 msec
|
||
|
OK
|
||
|
163800
|
||
|
Time taken: 23.878 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
Q1: The CPU time has been significantly reduced due to partitioning.
|
||
|
Q2: The time taken is shorter, but not significantly.
|
||
|
Q3: There is probably a lot of network overhead that is invovled here that contributes to a high wall clock duration.
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
### Task 3 ###
|
||
|
|
||
|
Partition on dayOfMonth:
|
||
|
|
||
|
hive> create table flights_partitioned_dayofmonth(Year int, Month int, dayOfWeek
|
||
|
int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string,
|
||
|
flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int,
|
||
|
arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int,
|
||
|
taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int,
|
||
|
weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int)
|
||
|
partitioned by (dayOfMonth int);
|
||
|
|
||
|
hive> insert into table flights_partitioned_dayofmonth partition(dayofmonth) select year,
|
||
|
dayofmonth, dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier,
|
||
|
flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay,
|
||
|
origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted,
|
||
|
carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month from
|
||
|
flights;
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_dayofmonth where month = 4;
|
||
|
|
||
|
Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 37.51 sec HDFS Read: 500055304 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 37 seconds 510 msec
|
||
|
OK
|
||
|
436007
|
||
|
Time taken: 25.869 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_dayofmonth where month = 11 and dayofmonth =6;
|
||
|
|
||
|
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 5.01 sec HDFS Read: 16261930 HDFS Write: 6 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 5 seconds 10 msec
|
||
|
OK
|
||
|
15092
|
||
|
Time taken: 24.038 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21;
|
||
|
|
||
|
Stage-Stage-1: Map: 7 Reduce: 1 Cumulative CPU: 24.53 sec HDFS Read: 181117544 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 24 seconds 530 msec
|
||
|
OK
|
||
|
163800
|
||
|
Time taken: 25.902 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
Partition on Month and dayOfMonth:
|
||
|
|
||
|
hive> create table flights_partitioned_month_dayofmonth(Year int, dayOfWeek
|
||
|
int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string,
|
||
|
flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int,
|
||
|
arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int,
|
||
|
taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int,
|
||
|
weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int)
|
||
|
partitioned by (Month int, dayOfMonth int);
|
||
|
|
||
|
hive> insert into table flights_partitioned_month_dayofmonth partition(month, dayofmonth) select year,
|
||
|
dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier,
|
||
|
flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay,
|
||
|
origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted,
|
||
|
carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month, dayofmonth from
|
||
|
flights;
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 4;
|
||
|
|
||
|
Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 26.8 sec HDFS Read: 39950203 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 26 seconds 800 msec
|
||
|
OK
|
||
|
436007
|
||
|
Time taken: 25.94 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 11 and dayofmonth =6;
|
||
|
|
||
|
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.97 sec HDFS Read: 1388476 HDFS Write: 6 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 3 seconds 970 msec
|
||
|
OK
|
||
|
15092
|
||
|
Time taken: 24.934 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21;
|
||
|
|
||
|
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.15 sec HDFS Read: 14961499 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 4 seconds 150 msec
|
||
|
OK
|
||
|
163800
|
||
|
Time taken: 24.476 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
### Task 4 ###
|
||
|
|
||
|
[mocull_ostack@openstack-vm-11-30 ~]$ date +"%T"; cat /home/data/1996_noheader.csv | awk -F',' '$2 == "8" {print $1}' | wc -l; date +"%T"
|
||
|
21:04:23
|
||
|
462294
|
||
|
21:04:27
|
||
|
|
||
|
|
||
|
|
||
|
hive> select count(*) from flights where month = 8;
|
||
|
|
||
|
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.21 sec HDFS Read: 534149555 HDFS Write: 7 SUCCESS
|
||
|
Total MapReduce CPU Time Spent: 18 seconds 210 msec
|
||
|
OK
|
||
|
462294
|
||
|
Time taken: 22.984 seconds, Fetched: 1 row(s)
|
||
|
|
||
|
|
||
|
|
||
|
Using AWK is much faster than Hive because it's all on one system, and the processing and networking needed
|
||
|
to recombine the data is not present anymore. AWK is about 19 seconds faster.
|
||
|
|
||
|
|
||
|
|