cs348/Project_4/mocull_project4.txt
2018-11-30 16:50:50 -05:00

207 lines
6.4 KiB
Plaintext

### Task 1 ###
hive> select count(*) from flights where month = 4;
stage-stage-1: map: 4 reduce: 1 cumulative cpu: 21.83 sec hdfs read: 534149548 hdfs write: 7 success
total mapreduce cpu time spent: 21 seconds 830 msec
ok
436007
time taken: 27.307 seconds, fetched: 1 row(s)
hive> select count(*) from flights where month = 11 and dayofmonth =6;
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 21.41 sec HDFS Read: 534150190 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 21 seconds 410 msec
OK
15092
Time taken: 27.366 seconds, Fetched: 1 row(s)
hive> select count(*) from flights where month = 8 and dayofmonth > 9 and dayofmonth < 21;
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.9 sec HDFS Read: 534151190 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 18 seconds 900 msec
OK
163800
Time taken: 26.045 seconds, Fetched: 1 row(s)
### Task 2 ###
hive> select count(*) from flights_partitioned_month where month = 4;
Stage-Stage-1: Map: 2 Reduce: 1 Cumulative CPU: 5.89 sec HDFS Read: 40967391 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 890 msec
OK
436007
Time taken: 21.732 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_month where month = 11 and dayofmonth =6;
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.77 sec HDFS Read: 40919465 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 770 msec
OK
15092
Time taken: 22.742 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_month where month = 8 and dayofmonth > 9 and dayofmonth < 21;
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.96 sec HDFS Read: 43445074 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 960 msec
OK
163800
Time taken: 23.878 seconds, Fetched: 1 row(s)
Q1: The CPU time has been significantly reduced due to partitioning.
Q2: The time taken is shorter, but not significantly.
Q3: There is probably a lot of network overhead that is invovled here that contributes to a high wall clock duration.
### Task 3 ###
Partition on dayOfMonth:
hive> create table flights_partitioned_dayofmonth(Year int, Month int, dayOfWeek
int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string,
flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int,
arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int,
taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int,
weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int)
partitioned by (dayOfMonth int);
hive> insert into table flights_partitioned_dayofmonth partition(dayofmonth) select year,
dayofmonth, dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier,
flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay,
origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted,
carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month from
flights;
hive> select count(*) from flights_partitioned_dayofmonth where month = 4;
Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 37.51 sec HDFS Read: 500055304 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 37 seconds 510 msec
OK
436007
Time taken: 25.869 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_dayofmonth where month = 11 and dayofmonth =6;
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 5.01 sec HDFS Read: 16261930 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 10 msec
OK
15092
Time taken: 24.038 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21;
Stage-Stage-1: Map: 7 Reduce: 1 Cumulative CPU: 24.53 sec HDFS Read: 181117544 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 24 seconds 530 msec
OK
163800
Time taken: 25.902 seconds, Fetched: 1 row(s)
Partition on Month and dayOfMonth:
hive> create table flights_partitioned_month_dayofmonth(Year int, dayOfWeek
int, depTime int, CRSDepTime int, arrTime int, CRSArrTime int, uniqueCarrier string,
flightNum int, tailNum int, actualElapsedTime int, CRSElapsedTime int, airTime int,
arrDelay int, depDelay int, origin string, dest string, distance int, taxiIn int,
taxiOut int, cancelled int, cancellationCode string, diverted int, carrierDelay int,
weatherDelay int, NASDelay int, securityDelay int, lateAircraftDelay int)
partitioned by (Month int, dayOfMonth int);
hive> insert into table flights_partitioned_month_dayofmonth partition(month, dayofmonth) select year,
dayofweek, deptime, crsdeptime, arrtime, crsarrtime, uniquecarrier,
flightnum, tailnum, actualelapsedtime, crselapsedtime, airtime, arrdelay, depdelay,
origin, dest, distance, taxiin, taxiout, cancelled, cancellationcode, diverted,
carrierdelay, weatherdelay, nasdelay, securitydelay, lateaircraftdelay, month, dayofmonth from
flights;
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 4;
Stage-Stage-1: Map: 10 Reduce: 1 Cumulative CPU: 26.8 sec HDFS Read: 39950203 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 26 seconds 800 msec
OK
436007
Time taken: 25.94 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 11 and dayofmonth =6;
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.97 sec HDFS Read: 1388476 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 3 seconds 970 msec
OK
15092
Time taken: 24.934 seconds, Fetched: 1 row(s)
hive> select count(*) from flights_partitioned_month_dayofmonth where month = 8 and dayofmonth > 9 and dayofmonth < 21;
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 4.15 sec HDFS Read: 14961499 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 150 msec
OK
163800
Time taken: 24.476 seconds, Fetched: 1 row(s)
### Task 4 ###
[mocull_ostack@openstack-vm-11-30 ~]$ date +"%T"; cat /home/data/1996_noheader.csv | awk -F',' '$2 == "8" {print $1}' | wc -l; date +"%T"
21:04:23
462294
21:04:27
hive> select count(*) from flights where month = 8;
Stage-Stage-1: Map: 4 Reduce: 1 Cumulative CPU: 18.21 sec HDFS Read: 534149555 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 18 seconds 210 msec
OK
462294
Time taken: 22.984 seconds, Fetched: 1 row(s)
Using AWK is much faster than Hive because it's all on one system, and the processing and networking needed
to recombine the data is not present anymore. AWK is about 19 seconds faster.