From 881b9206dce7a6b2d44239d435535596dcfde9ae Mon Sep 17 00:00:00 2001 From: Max O'Cull Date: Fri, 30 Nov 2018 16:50:50 -0500 Subject: [PATCH] Finish Project 4 --- Project_4/mocull_project4.txt | 206 ++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 Project_4/mocull_project4.txt diff --git a/Project_4/mocull_project4.txt b/Project_4/mocull_project4.txt new file mode 100644 index 0000000..ca1e084 --- /dev/null +++ b/Project_4/mocull_project4.txt @@ -0,0 +1,206 @@ +### Task 1 ### + +hive> select count(*) from flights where month = 4; + +stage-stage-1: map: 4 reduce: 1 cumulative cpu: 21.83 sec hdfs read: 534149548 hdfs write: 7 success +total mapreduce cpu time spent: 21 seconds 830 msec +ok +436007 +time taken: 27.307 seconds, fetched: 1 row(s) + + + +hive> select count(*) from flights where month = 11 and dayofmonth =6; + +Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 21.41 sec HDFS Read: 534150190 HDFS Write: 6 SUCCESS +Total MapReduce CPU Time Spent: 21 seconds 410 msec +OK +15092 +Time taken: 27.366 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights where month = 8 and dayofmonth > 9 and dayofmonth < 21; + +Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.9 sec HDFS Read: 534151190 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 18 seconds 900 msec +OK +163800 +Time taken: 26.045 seconds, Fetched: 1 row(s) + + + + + + + +### Task 2 ### + +hive> select count(*) from flights_partitioned_month where month = 4; + +Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 5.89 sec HDFS Read: 40967391 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 5 seconds 890 msec +OK +436007 +Time taken: 21.732 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_month where month = 11 and dayofmonth =6; + +Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.77 sec HDFS Read: 40919465 HDFS Write: 6 SUCCESS +Total MapReduce CPU Time Spent: 4 seconds 770 msec +OK +15092 +Time taken: 22.742 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_month where month = 8 and dayofmonth > 9 and dayofmonth < 21; + +Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.96 sec HDFS Read: 43445074 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 4 seconds 960 msec +OK +163800 +Time taken: 23.878 seconds, Fetched: 1 row(s) + + + +Q1: The CPU time has been significantly reduced due to partitioning. +Q2: The time taken is shorter, but not significantly. +Q3: There is probably a lot of network overhead that is invovled here that contributes to a high wall clock duration. + + + + + + + +### Task 3 ### + +Partition on dayOfMonth: + +hive> create table flights_partitioned_dayofmonth(Year int, Month int, dayOfWeek +int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string, +flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int, +arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int, +taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int, +weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int) +partitioned by (dayOfMonth int); + +hive> insert into table flights_partitioned_dayofmonth partition(dayofmonth) select year, +dayofmonth, dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier, +flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay, +origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted, +carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month from +flights; + + + +hive> select count(*) from flights_partitioned_dayofmonth where month = 4; + +Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 37.51 sec HDFS Read: 500055304 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 37 seconds 510 msec +OK +436007 +Time taken: 25.869 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_dayofmonth where month = 11 and dayofmonth =6; + +Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 5.01 sec HDFS Read: 16261930 HDFS Write: 6 SUCCESS +Total MapReduce CPU Time Spent: 5 seconds 10 msec +OK +15092 +Time taken: 24.038 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21; + +Stage-Stage-1: Map: 7 Reduce: 1 Cumulative CPU: 24.53 sec HDFS Read: 181117544 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 24 seconds 530 msec +OK +163800 +Time taken: 25.902 seconds, Fetched: 1 row(s) + + + +Partition on Month and dayOfMonth: + +hive> create table flights_partitioned_month_dayofmonth(Year int, dayOfWeek +int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string, +flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int, +arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int, +taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int, +weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int) +partitioned by (Month int, dayOfMonth int); + +hive> insert into table flights_partitioned_month_dayofmonth partition(month, dayofmonth) select year, +dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier, +flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay, +origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted, +carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month, dayofmonth from +flights; + + + +hive> select count(*) from flights_partitioned_month_dayofmonth where month = 4; + +Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 26.8 sec HDFS Read: 39950203 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 26 seconds 800 msec +OK +436007 +Time taken: 25.94 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_month_dayofmonth where month = 11 and dayofmonth =6; + +Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.97 sec HDFS Read: 1388476 HDFS Write: 6 SUCCESS +Total MapReduce CPU Time Spent: 3 seconds 970 msec +OK +15092 +Time taken: 24.934 seconds, Fetched: 1 row(s) + + + +hive> select count(*) from flights_partitioned_month_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21; + +Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.15 sec HDFS Read: 14961499 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 4 seconds 150 msec +OK +163800 +Time taken: 24.476 seconds, Fetched: 1 row(s) + + + + + + + +### Task 4 ### + +[mocull_ostack@openstack-vm-11-30 ~]$ date +"%T"; cat /home/data/1996_noheader.csv | awk -F',' '$2 == "8" {print $1}' | wc -l; date +"%T" +21:04:23 +462294 +21:04:27 + + + +hive> select count(*) from flights where month = 8; + +Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.21 sec HDFS Read: 534149555 HDFS Write: 7 SUCCESS +Total MapReduce CPU Time Spent: 18 seconds 210 msec +OK +462294 +Time taken: 22.984 seconds, Fetched: 1 row(s) + + + +Using AWK is much faster than Hive because it's all on one system, and the processing and networking needed +to recombine the data is not present anymore. AWK is about 19 seconds faster. + + +