This data set was imported from Kaggle. I chose to examine “Absenteeism at Work” to see what correlations I could find. In particular, I wanted to see how the month of absence and the percentage of targets hit relate to each other.
Bootstrapping is where random samples of the data (drawn with replacement) are used to construct multiple decision trees. Since each tree only sees part of the data, each tree is less accurate than if it had been constructed over the full data set. Thus, each tree is known as a weak learner. A more powerful meta-estimator is subsequently constructed by averaging over these many weak learners. The approach of constructing weak learners and combining them into a more powerful estimator is at the heart of several very powerful machine learning techniques, including the random forest.
I found that there are a couple of outliers in the months we observe as summer. In the winter, there is less absenteeism and a higher percentage of targets was hit. This finding could be used to introduce an incentive that encourages workers not to use their vacation time all at once. The targets are set in place to ensure that the company stays on track to reach investor expectations each year. By encouraging employees to spread out their vacation time, management would have a better chance of reaching those targets consistently throughout the year.
library(curl)
## Using libcurl 7.64.1 with Schannel
#NOTE(review): per its URL this loads the "tips" data set; `data.for.project`,
#which every later chunk uses, is not created by any line shown here --
#confirm where the absenteeism data is actually loaded.
load(curl("https://raw.githubusercontent.com/Professor-Hunt/ACC8143/main/data/tips.rda"))
#fix the random seed so the bootstrap resamples are reproducible
set.seed(0)
library(rsample)
## Warning: package 'rsample' was built under R version 4.1.2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x readr::parse_date() masks curl::parse_date()
#perform bootstrapping with 100 replications (times = 100)
#the one-column data frame built here names its column
#`data.for.project$Month.of.absence`; later lines refer to it by that name
Absenteeism1 <- bootstraps(as.data.frame(data.for.project$Month.of.absence), times = 100)
#view results of bootstrapping (first 5 rows of the summary table)
knitr::kable(head(summary(Absenteeism1),5))
| splits.Length splits.Class splits.Mode | id | |
|---|---|---|
| 4 boot_split list | Length:100 | |
| 4 boot_split list | Class :character | |
| 4 boot_split list | Mode :character | |
| 4 boot_split list | NA | |
| 4 boot_split list | NA |
#info for a specific sample
Absenteeism1$splits[[1]]
## <Analysis/Assess/Total>
## <740/278/740>
#mean of the first bootstrap resample
#(the split's $data element holds the full original data set, not the
#resample, so the resampled rows are extracted with as.data.frame())
mean(as.data.frame(Absenteeism1$splits[[1]])$`data.for.project$Month.of.absence`)
## [1] 6.527027
#standard deviation of the first bootstrap resample
sd(as.data.frame(Absenteeism1$splits[[1]])$`data.for.project$Month.of.absence`)
## [1] 3.532271
#mean of Month.of.absence within every bootstrap resample
mean_values <- purrr::map_dbl(
  Absenteeism1$splits,
  function(split) {
    resample <- as.data.frame(split)
    mean(resample[["data.for.project$Month.of.absence"]])
  }
)
#show all resample means in a striped, scrollable table
mean_values %>%
  knitr::kable() %>%
  kableExtra::kable_styling("striped") %>%
  kableExtra::scroll_box(width = "50%", height = "300px")
| x |
|---|
| 6.527027 |
| 6.271622 |
| 6.404054 |
| 6.254054 |
| 6.113513 |
| 6.297297 |
| 6.285135 |
| 6.610811 |
| 6.131081 |
| 6.217568 |
| 6.463514 |
| 6.451351 |
| 6.404054 |
| 6.263514 |
| 6.174324 |
| 6.294595 |
| 6.591892 |
| 6.228378 |
| 6.310811 |
| 6.493243 |
| 6.232432 |
| 6.431081 |
| 6.364865 |
| 6.462162 |
| 6.195946 |
| 6.454054 |
| 6.237838 |
| 6.379730 |
| 6.495946 |
| 6.354054 |
| 6.217568 |
| 6.433784 |
| 6.018919 |
| 6.381081 |
| 6.314865 |
| 6.218919 |
| 6.266216 |
| 6.491892 |
| 6.221622 |
| 6.382432 |
| 6.437838 |
| 6.364865 |
| 6.275676 |
| 6.372973 |
| 6.294595 |
| 6.358108 |
| 6.495946 |
| 6.364865 |
| 6.236486 |
| 6.304054 |
| 6.318919 |
| 6.260811 |
| 6.295946 |
| 6.424324 |
| 6.589189 |
| 6.341892 |
| 6.382432 |
| 6.318919 |
| 6.370270 |
| 6.271622 |
| 6.348649 |
| 6.320270 |
| 6.359459 |
| 6.286486 |
| 6.501351 |
| 6.201351 |
| 6.200000 |
| 6.143243 |
| 6.017568 |
| 6.190541 |
| 6.393243 |
| 6.204054 |
| 6.455405 |
| 6.300000 |
| 6.095946 |
| 6.329730 |
| 6.490540 |
| 6.233784 |
| 6.331081 |
| 6.612162 |
| 6.208108 |
| 6.187838 |
| 6.433784 |
| 6.175676 |
| 6.416216 |
| 6.295946 |
| 6.368919 |
| 6.217568 |
| 6.313514 |
| 6.281081 |
| 6.202703 |
| 6.290540 |
| 6.241892 |
| 6.256757 |
| 6.243243 |
| 6.300000 |
| 6.313514 |
| 6.302703 |
| 6.513514 |
| 6.197297 |
#estimate of the population mean: the average of the bootstrap resample means
mean(mean_values)
## [1] 6.321986
#standard deviation of Month.of.absence within every bootstrap resample
sd_values <- purrr::map_dbl(
  Absenteeism1$splits,
  function(split) {
    resample <- as.data.frame(split)
    sd(resample[["data.for.project$Month.of.absence"]])
  }
)
#show all resample standard deviations in a striped, scrollable table
sd_values %>%
  knitr::kable() %>%
  kableExtra::kable_styling("striped") %>%
  kableExtra::scroll_box(width = "50%", height = "300px")
| x |
|---|
| 3.532271 |
| 3.454793 |
| 3.477783 |
| 3.518410 |
| 3.413431 |
| 3.375987 |
| 3.380041 |
| 3.510648 |
| 3.457119 |
| 3.458623 |
| 3.351734 |
| 3.419131 |
| 3.461403 |
| 3.451111 |
| 3.381977 |
| 3.468743 |
| 3.548380 |
| 3.387553 |
| 3.399139 |
| 3.482601 |
| 3.456284 |
| 3.485808 |
| 3.420761 |
| 3.485120 |
| 3.355078 |
| 3.436932 |
| 3.383903 |
| 3.489845 |
| 3.341823 |
| 3.406441 |
| 3.405792 |
| 3.428322 |
| 3.468734 |
| 3.512694 |
| 3.442085 |
| 3.457168 |
| 3.407894 |
| 3.438804 |
| 3.514065 |
| 3.441522 |
| 3.392290 |
| 3.431425 |
| 3.439751 |
| 3.448256 |
| 3.506765 |
| 3.423255 |
| 3.495791 |
| 3.527101 |
| 3.451503 |
| 3.434795 |
| 3.497290 |
| 3.397974 |
| 3.388308 |
| 3.360755 |
| 3.349129 |
| 3.397942 |
| 3.430890 |
| 3.394004 |
| 3.405509 |
| 3.344538 |
| 3.413348 |
| 3.427007 |
| 3.503021 |
| 3.436110 |
| 3.370050 |
| 3.411554 |
| 3.523054 |
| 3.352299 |
| 3.402771 |
| 3.524155 |
| 3.555964 |
| 3.396685 |
| 3.504787 |
| 3.269865 |
| 3.514168 |
| 3.448352 |
| 3.425000 |
| 3.502668 |
| 3.469934 |
| 3.469506 |
| 3.477348 |
| 3.529671 |
| 3.381025 |
| 3.423069 |
| 3.547819 |
| 3.420900 |
| 3.436705 |
| 3.512971 |
| 3.362064 |
| 3.344362 |
| 3.438537 |
| 3.393166 |
| 3.556941 |
| 3.438856 |
| 3.494480 |
| 3.388151 |
| 3.444370 |
| 3.425645 |
| 3.350290 |
| 3.351165 |
#estimate of the population standard deviation: the average of the bootstrap
#resample standard deviations (parallel to mean(mean_values) for the mean)
mean(sd_values)
## [1] 3.438651
#spread of the bootstrap SD estimates, i.e. how much the estimate itself
#varies from resample to resample (this is not the population SD)
sd(sd_values)
## [1] 0.05819903
set.seed(1)
#lets split the data 60/40
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
#row indices of the 60% training partition, drawn on Hit.target
#(named trainIndex because that is the name the subsetting lines use)
trainIndex <- createDataPartition(data.for.project$Hit.target, p = .6, list = FALSE, times = 1)
#grab the data
DataTrain <- data.for.project[ trainIndex,]
DataTest <- data.for.project[-trainIndex,]
#training rows are drawn semi-transparent and coloured by month;
#test rows are overlaid with the default point style
#NOTE(review): no shape aesthetic is mapped, so labs(shape = ...) has no
#visible effect, and the colour legend titled "Testing Targets" belongs to
#the training layer -- confirm the intended legend titles
ggplot(data = DataTrain) +
  geom_point(
    mapping = aes(x = Month.of.absence, y = Hit.target, color = Month.of.absence),
    alpha = 0.5
  ) +
  labs(color = "Testing Targets") +
  geom_point(data = DataTest, mapping = aes(x = Month.of.absence, y = Hit.target)) +
  labs(shape = "Testing Targets") +
  ggtitle("Absenteeism") +
  theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"))
