library(ggplot2)
library(dplyr)
library(tidyverse)
#Question 6
# Load the Austin, Texas flight data from CSV into the Austin data frame
Austin <- read.csv(file = "Austin_Texas.csv")
# Inspect the structure (column names and types) of the imported data
str(Austin)
# (b)
# Proportion of delayed arrivals per row: arr_del15 divided by arr_flights
# (presumably arrivals delayed 15+ minutes -- confirm against the data source)
Austin$delay <- Austin$arr_del15 / Austin$arr_flights
# Delay proportion over time, one coloured line (with point markers) per carrier
ggplot(
  data = Austin,
  aes(x = time, y = delay, group = carrier, color = carrier)
) +
  geom_line() +
  geom_point()
# Flights operated by American show the highest proportion of delays,
# particularly after 2015
# Mean delay proportion per carrier; uses the Austin data frame (the name
# `AUS` is never defined in this script and would raise "object not found")
tapply(Austin$delay, Austin$carrier, mean)
# AA has the highest proportion of delayed flights, followed by UA and DL,
# in that order.
# (c)
# Subset the flight data from January 2014 onwards (year >= 2014) into a
# separate data frame; which() drops rows whose year is NA
Austin_2014 <- Austin[which(Austin$year >= 2014), ]
str(Austin_2014)
# Boxplot of each airline's proportion of delayed flights, January 2014 and
# beyond. The carrier is mapped to x so that every airline gets its own box;
# mapping x to time with group = carrier places all boxes at the midpoint of
# the time range, where they overlap.
ggplot(data = Austin_2014, aes(x = carrier, y = delay, color = carrier)) +
  geom_boxplot()
# (d)
# Extract each carrier's delay proportions (2014 onwards) into separate vectors.
# The column is selected by name rather than by position so the extraction
# does not silently pick another column if the CSV layout changes.
# NOTE(review): the previous version used column index 24, presumably the
# appended `delay` column -- confirm against the CSV's column count.
AA <- Austin_2014[which(Austin_2014$carrier == "AA"), "delay"]
DL <- Austin_2014[which(Austin_2014$carrier == "DL"), "delay"]
UA <- Austin_2014[which(Austin_2014$carrier == "UA"), "delay"]
# Before proceeding with the t-tests, test the assumption of equal variances
# for every pair of carriers (F test)
var.test(AA, DL)
# P-value = 0.09612: at the 5% significance level, we fail to reject the null
# hypothesis that the variances are equal
var.test(DL, UA)
# P-value = 0.01792: at the 5% significance level, we reject the null
# hypothesis that the variances are equal
var.test(UA, AA)
# P-value = 0.4747: at the 5% significance level, we fail to reject the null
# hypothesis that the variances are equal
# The t-tests can now be run, setting var.equal according to the results above
# (Welch's unequal-variance test for UA vs DL, pooled variance otherwise)
t.test(UA, DL, var.equal = FALSE)
# P-value = 2.155e-06: at the 5% significance level, the delays of UA and DL
# are statistically different
t.test(AA, UA, var.equal = TRUE)
# P-value = 0.2215: at the 5% significance level, the delays of UA and AA
# are not statistically different
t.test(AA, DL, var.equal = TRUE)
# P-value = 6.345e-10: at the 5% significance level, the delays of AA and DL
# are statistically different