library(dplyr)
library(ggplot2)
library(ggthemes)
library(plotly)
library(missForest)
# Read the properties file and the 2016 training file, then join them on the
# shared `parcelid` key (merge() keeps only parcels present in both).
properties_df <- read.csv("properties_2016.csv")
training_df <- read.csv("train_2016_v2.csv")
train_prop_df <- merge(x = properties_df, y = training_df, by = "parcelid")
# Count the transactions recorded on each date and draw the daily counts as an
# interactive line chart.
date_counts <- train_prop_df %>%
  group_by(transactiondate) %>%
  summarise(count = n())
date_counts$transactiondate <- as.Date(date_counts$transactiondate)

# Columns are referenced by bare name inside aes(): `date_counts$col` bypasses
# ggplot2's data mask (ggplot2 warns that this use is discouraged) and breaks
# as soon as the layer data is subset or facetted.
ggplotly(
  ggplot(
    date_counts,
    aes(
      x = transactiondate,
      y = count,
      group = 1,
      # `text` is not drawn by geom_path(); it only feeds the plotly tooltip.
      text = paste0("Date: ", transactiondate, "<br>", "Count: ", count)
    )
  ) +
    geom_path(colour = "#FF6600") +
    xlab("Date") +
    ylab("Count") +
    theme_minimal(),
  tooltip = "text"
)
# Preview the first rows of the merged data.
head(train_prop_df)

# Fraction of NA values in every column of the merged data.
missing_data_by_col <- colSums(is.na(train_prop_df)) / nrow(train_prop_df)
# Converting the named vector in its own step makes as.data.frame() name the
# single column after the variable, i.e. `missing_data_by_col`.
missing_data_by_col <- as.data.frame(missing_data_by_col)
missing_data_by_col$column_name <- row.names(missing_data_by_col)

# Horizontal bar chart of the missing fraction per column, ordered by that
# fraction; the tooltip shows the column name and its value.
ggplotly(
  ggplot(
    missing_data_by_col,
    aes(
      x = reorder(column_name, missing_data_by_col),
      y = missing_data_by_col,
      text = paste0(
        "Column: ", column_name, "<br>", "Percentage: ", missing_data_by_col
      )
    )
  ) +
    geom_bar(stat = "identity", fill = "yellow") +
    coord_flip() +
    xlab("Columns") +
    ylab("Missing Data by Percentage") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 5, angle = 20)),
  tooltip = "text"
)
# Bin `logerror` into 0.9-wide intervals starting at its minimum and store
# each row's interval label in the factor column `logerror_cat`.
cat_log_error <- seq(
  min(train_prop_df$logerror),
  max(train_prop_df$logerror),
  by = 0.9
)

# Every break that has an upper neighbour is labelled "<lower> to <upper>";
# the last break has none and is labelled by its own value.
logerror_labels <- as.character(cat_log_error)
if (length(cat_log_error) > 1) {
  lower_idx <- seq_len(length(cat_log_error) - 1)
  logerror_labels[lower_idx] <- paste0(
    cat_log_error[lower_idx], " to ", cat_log_error[lower_idx + 1]
  )
}

# findInterval() returns j with breaks[j] <= x < breaks[j + 1], and the index
# of the last break for x at or beyond it, so one vectorised lookup assigns
# every row its bin. The column is created by name instead of by the magic
# position 61, which only worked while the merged frame had exactly 60 columns.
train_prop_df$logerror_cat <- factor(
  logerror_labels[findInterval(train_prop_df$logerror, cat_log_error)]
)
# Number of rows for each (yearbuilt, logerror_cat) pair, drawn as stacked
# bars per build year coloured by log-error bin.
logerror_year <- train_prop_df %>%
  group_by(yearbuilt, logerror_cat) %>%
  summarise(count = n())

ggplotly(
  ggplot(logerror_year, aes(x = yearbuilt, y = count, fill = logerror_cat)) +
    geom_bar(stat = "identity") +
    theme_minimal()
)
Hmm, interesting: most of the logerror values (the value we are trying to predict) lie between -1.05 and .795. Let's look at this range in more depth and see how it splits up based on when the house was built.
# Keep only the rows whose logerror lies in [-1.05, 0.795) and re-bin them
# into finer 0.2-wide intervals, stored as the factor column
# `logerror_cat_in_depth`.
log_indepth_df <- train_prop_df[
  train_prop_df$logerror >= -1.05 & train_prop_df$logerror < 0.795,
]
cat_log_error <- seq(
  min(log_indepth_df$logerror),
  max(log_indepth_df$logerror),
  by = 0.2
)

# Every break that has an upper neighbour is labelled "<lower> to <upper>";
# the last break has none and is labelled by its own value.
in_depth_labels <- as.character(cat_log_error)
if (length(cat_log_error) > 1) {
  lower_idx <- seq_len(length(cat_log_error) - 1)
  in_depth_labels[lower_idx] <- paste0(
    cat_log_error[lower_idx], " to ", cat_log_error[lower_idx + 1]
  )
}

# Write to a new, named column. In this subset position 61 is already taken by
# the factor `logerror_cat` inherited from train_prop_df, so assigning label
# strings to column 61 produced NA ("invalid factor level") for every row and
# then renamed the clobbered column. findInterval() returns j with
# breaks[j] <= x < breaks[j + 1], matching the original bin boundaries.
log_indepth_df$logerror_cat_in_depth <- factor(
  in_depth_labels[findInterval(log_indepth_df$logerror, cat_log_error)]
)
# Row counts per (yearbuilt, logerror_cat_in_depth) pair for the restricted
# range, drawn as stacked bars per build year.
logerror_year <- log_indepth_df %>%
  group_by(yearbuilt, logerror_cat_in_depth) %>%
  summarise(count = n())

ggplotly(
  ggplot(
    logerror_year,
    aes(x = yearbuilt, y = count, fill = logerror_cat_in_depth)
  ) +
    geom_bar(stat = "identity") +
    theme_minimal()
)
# Row counts per (bedroomcnt, logerror_cat) pair, drawn as stacked bars per
# bedroom count coloured by log-error bin.
bed_year <- train_prop_df %>%
  group_by(bedroomcnt, logerror_cat) %>%
  summarise(count = n())

ggplotly(
  ggplot(bed_year, aes(x = bedroomcnt, y = count, fill = logerror_cat)) +
    geom_bar(stat = "identity") +
    theme_minimal()
)
# Share (in percent, rounded to 2 decimals) of each log-error bin within every
# build year. summarise() drops only the last grouping level, so the result is
# still grouped by yearbuilt and sum(count) is the per-year total.
logerror_year <- train_prop_df %>%
  group_by(yearbuilt, logerror_cat) %>%
  summarise(count = n()) %>%
  mutate(per = (round(count / sum(count) * 100, 2)))

# Keep only the rows of the "-0.105 to 0.795" bin and plot its share per year.
see <- logerror_year[logerror_year$logerror_cat == "-0.105 to 0.795", ]

ggplotly(
  ggplot(see, aes(x = yearbuilt, y = per)) +
    geom_bar(stat = "identity", fill = "#F8766D", colour = "grey") +
    theme_minimal() +
    labs(title = "-0.105 to 0.795 Distribution") +
    ylab("percent")
)
# Names of the columns whose missing fraction is below 0.4.
names_less_than_40per <- missing_data_by_col[
  missing_data_by_col$missing_data_by_col < 0.4,
  "column_name"
]

# Restrict the merged data to those columns, then drop a fixed set of them by
# position within that selection.
train_complete_data <- train_prop_df[, names_less_than_40per]
train_complete_data <- train_complete_data[
  ,
  -c(1, 8, 10, 15, 17, 18, 20, 21, 25, 28, 31, 32, 34)
]

# Blank out an additional 5% of the entries so the imputation can later be
# compared with the known values. prodNA() picks the entries at random, so the
# result differs between runs.
train_prop_df_miss <- prodNA(train_complete_data, noNA = 0.05)
summary(train_prop_df_miss)
bathroomcnt bedroomcnt buildingqualitytypeid
Min. : 0.00 Min. : 0.000 Min. : 1.00
1st Qu.: 2.00 1st Qu.: 2.000 1st Qu.: 4.00
Median : 2.00 Median : 3.000 Median : 7.00
Mean : 2.28 Mean : 3.031 Mean : 5.56
3rd Qu.: 3.00 3rd Qu.: 4.000 3rd Qu.: 7.00
Max. :20.00 Max. :16.000 Max. :12.00
NA's :4634 NA's :4540 NA's :35778
calculatedbathnbr calculatedfinishedsquarefeet finishedsquarefeet12
Min. : 1.000 Min. : 2 Min. : 2
1st Qu.: 2.000 1st Qu.: 1183 1st Qu.: 1172
Median : 2.000 Median : 1540 Median : 1518
Mean : 2.309 Mean : 1773 Mean : 1746
3rd Qu.: 3.000 3rd Qu.: 2094 3rd Qu.: 2057
Max. :20.000 Max. :22741 Max. :20013
NA's :5625 NA's :5037 NA's :8884
fullbathcnt heatingorsystemtypeid latitude
Min. : 1.000 Min. : 1.00 Min. :33339295
1st Qu.: 2.000 1st Qu.: 2.00 1st Qu.:33811683
Median : 2.000 Median : 2.00 Median :34021602
Mean : 2.242 Mean : 3.92 Mean :34005381
3rd Qu.: 3.000 3rd Qu.: 7.00 3rd Qu.:34172500
Max. :20.000 Max. :24.00 Max. :34816009
NA's :5654 NA's :37019 NA's :4446
longitude lotsizesquarefeet propertylandusetypeid
Min. :-119447353 Min. : 167 Min. : 31.0
1st Qu.:-118411412 1st Qu.: 5706 1st Qu.:261.0
Median :-118172908 Median : 7200 Median :261.0
Mean :-118198550 Mean : 29205 Mean :261.8
3rd Qu.:-117921344 3rd Qu.: 11694 3rd Qu.:266.0
Max. :-117554924 Max. :6971010 Max. :275.0
NA's :4575 NA's :14136 NA's :4540
regionidcity roomcnt unitcnt yearbuilt
Min. : 3491 Min. : 0.000 Min. : 1.00 Min. :1885
1st Qu.: 12447 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.:1953
Median : 25218 Median : 0.000 Median : 1.00 Median :1970
Mean : 33705 Mean : 1.477 Mean : 1.11 Mean :1969
3rd Qu.: 45457 3rd Qu.: 0.000 3rd Qu.: 1.00 3rd Qu.:1987
Max. :396556 Max. :18.000 Max. :143.00 Max. :2015
NA's :6251 NA's :4546 NA's :34827 NA's :5145
structuretaxvaluedollarcnt taxvaluedollarcnt landtaxvaluedollarcnt
Min. : 100 Min. : 22 Min. : 22
1st Qu.: 81459 1st Qu.: 198834 1st Qu.: 82244
Median : 132167 Median : 342985 Median : 193004
Mean : 180211 Mean : 457896 Mean : 277897
3rd Qu.: 210596 3rd Qu.: 540085 3rd Qu.: 345694
Max. :9948100 Max. :27750000 Max. :24500000
NA's :4752 NA's :4528 NA's :4533
taxamount logerror
Min. : 49.1 Min. :-4.605
1st Qu.: 2872.3 1st Qu.:-0.025
Median : 4540.8 Median : 0.006
Mean : 5987.6 Mean : 0.012
3rd Qu.: 6899.3 3rd Qu.: 0.039
Max. :321936.1 Max. : 4.737
NA's :4611 NA's :4550
# Summarise the data as it was before prodNA() injected extra NAs, for
# comparison with the summary of train_prop_df_miss.
summary(train_complete_data)
bathroomcnt bedroomcnt buildingqualitytypeid
Min. : 0.000 Min. : 0.000 Min. : 1.00
1st Qu.: 2.000 1st Qu.: 2.000 1st Qu.: 4.00
Median : 2.000 Median : 3.000 Median : 7.00
Mean : 2.279 Mean : 3.032 Mean : 5.57
3rd Qu.: 3.000 3rd Qu.: 4.000 3rd Qu.: 7.00
Max. :20.000 Max. :16.000 Max. :12.00
NA's :32911
calculatedbathnbr calculatedfinishedsquarefeet finishedsquarefeet12
Min. : 1.000 Min. : 2 Min. : 2
1st Qu.: 2.000 1st Qu.: 1184 1st Qu.: 1172
Median : 2.000 Median : 1540 Median : 1518
Mean : 2.309 Mean : 1773 Mean : 1745
3rd Qu.: 3.000 3rd Qu.: 2095 3rd Qu.: 2056
Max. :20.000 Max. :22741 Max. :20013
NA's :1182 NA's :661 NA's :4679
fullbathcnt heatingorsystemtypeid latitude
Min. : 1.000 Min. : 1.00 Min. :33339295
1st Qu.: 2.000 1st Qu.: 2.00 1st Qu.:33811538
Median : 2.000 Median : 2.00 Median :34021500
Mean : 2.241 Mean : 3.93 Mean :34005411
3rd Qu.: 3.000 3rd Qu.: 7.00 3rd Qu.:34172742
Max. :20.000 Max. :24.00 Max. :34816009
NA's :1182 NA's :34195
longitude lotsizesquarefeet propertylandusetypeid
Min. :-119447865 Min. : 167 Min. : 31.0
1st Qu.:-118411692 1st Qu.: 5703 1st Qu.:261.0
Median :-118173431 Median : 7200 Median :261.0
Mean :-118198868 Mean : 29110 Mean :261.8
3rd Qu.:-117921588 3rd Qu.: 11686 3rd Qu.:266.0
Max. :-117554924 Max. :6971010 Max. :275.0
NA's :10150
regionidcity roomcnt unitcnt yearbuilt
Min. : 3491 Min. : 0.000 Min. : 1.00 Min. :1885
1st Qu.: 12447 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.:1953
Median : 25218 Median : 0.000 Median : 1.00 Median :1970
Mean : 33761 Mean : 1.479 Mean : 1.11 Mean :1969
3rd Qu.: 45457 3rd Qu.: 0.000 3rd Qu.: 1.00 3rd Qu.:1987
Max. :396556 Max. :18.000 Max. :143.00 Max. :2015
NA's :1803 NA's :31922 NA's :756
structuretaxvaluedollarcnt taxvaluedollarcnt landtaxvaluedollarcnt
Min. : 100 Min. : 22 Min. : 22
1st Qu.: 81245 1st Qu.: 199023 1st Qu.: 82228
Median : 132000 Median : 342872 Median : 192970
Mean : 180093 Mean : 457673 Mean : 278335
3rd Qu.: 210534 3rd Qu.: 540589 3rd Qu.: 345420
Max. :9948100 Max. :27750000 Max. :24500000
NA's :380 NA's :1 NA's :1
taxamount logerror
Min. : 49.1 Min. :-4.60500
1st Qu.: 2872.8 1st Qu.:-0.02530
Median : 4542.8 Median : 0.00600
Mean : 5984.0 Mean : 0.01146
3rd Qu.: 6901.1 3rd Qu.: 0.03920
Max. :321936.1 Max. : 4.73700
NA's :6
library(doParallel)
Loading required package: parallel
# Register a parallel backend with 3 cores, then impute the missing entries
# with missForest using 75 trees per forest and the "forests" parallel mode.
registerDoParallel(cores = 3)
train.imp <- missForest(
  train_prop_df_miss,
  ntree = 75,
  parallelize = "forests"
)
missForest iteration 1 in progress...done!
missForest iteration 2 in progress...done!
missForest iteration 3 in progress...done!
missForest iteration 4 in progress...done!
missForest iteration 5 in progress...done!
# Print the imputed data set returned by missForest().
train.imp[["ximp"]]
# Print the out-of-bag imputation error estimate returned by missForest().
train.imp[["OOBerror"]]
NRMSE
0.001615067
# Score the imputation against the values that were known before prodNA().
# mixError() scores every cell that is NA in train_prop_df_miss, but
# train_complete_data already has NAs of its own; those cells have no known
# true value and turn the whole NRMSE into NA. Score only the cells that are
# missing in train_prop_df_miss and observed in train_complete_data.
stopifnot(all(vapply(train_complete_data, is.numeric, logical(1))))
true_values <- as.matrix(train_complete_data)
scored_cells <- is.na(as.matrix(train_prop_df_miss)) & !is.na(true_values)
imputed_scored <- as.matrix(train.imp$ximp)[scored_cells]
true_scored <- true_values[scored_cells]

# Normalised RMSE: sqrt(mean squared error / variance of the true values),
# returned as a named vector like mixError() does for all-numeric data.
train.err <- c(
  NRMSE = sqrt(mean((imputed_scored - true_scored)^2) / var(true_scored))
)
train.err
NRMSE
NA