`library(tidyverse)`

`## -- Attaching packages ---------------------------- tidyverse 1.2.1 --`

```
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
```

```
## -- Conflicts ------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
```

```
# Switch the session to the project folder (machine-specific absolute path),
# then load the raw CSV from that folder into a data frame.
setwd("C:/Users/trevo/OneDrive/Documents/COSC 219/sanford")
Sanford_data <- read.csv(
  file = "Sanford_Data_Collaborative_Teaching_DataSet.csv"
)
```

```
# Print a compact summary of every column (type and first few values).
str(object = Sanford_data)
```

```
## 'data.frame': 155143 obs. of 15 variables:
## $ ID : int 2 8 9 11 12 16 20 21 27 39 ...
## $ Sex : Factor w/ 3 levels "Female","Male",..: 2 1 1 2 2 2 1 1 2 2 ...
## $ Age : Factor w/ 73 levels "18","19","20",..: 65 33 43 42 48 35 56 42 22 21 ...
## $ Status : Factor w/ 2 levels "Alive","Deceased": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hypertension : int 0 0 1 1 1 1 1 1 1 0 ...
## $ VascularDisease : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Payor : Factor w/ 3 levels "Medicaid","Medicare",..: 2 3 3 3 3 3 2 3 3 3 ...
## $ Diabetes : int 0 0 1 1 0 1 0 0 0 0 ...
## $ A1C : Factor w/ 129 levels "10","10.1","10.2",..: 129 129 89 105 129 110 129 129 129 129 ...
## $ BMI : Factor w/ 4812 levels "1.24","10.12",..: 1120 1707 1658 1810 2029 1885 2150 2056 4515 3232 ...
## $ ScheduledClinicVisits: Factor w/ 96 levels "0","1","10","102",..: 43 32 85 54 76 21 2 76 43 43 ...
## $ MissedClinicVisits : Factor w/ 26 levels "0","1","10","11",..: 1 1 1 1 2 1 1 1 1 2 ...
## $ DiastolicBP : int 72 91 78 79 82 69 64 82 86 73 ...
## $ SystolicBP : int 139 150 122 116 117 149 131 145 141 134 ...
## $ SmokingStatus : int 4 5 5 4 4 1 5 5 5 5 ...
```

```
# Age is not numeric after loading; go through character so the labels
# themselves (not internal factor codes) are converted to numbers.
# NOTE(review): entries that are not plain numbers become NA here -- check
# which values are being dropped before relying on Age.
Sanford_data[["Age"]] <- as.numeric(as.character(Sanford_data[["Age"]]))
```

`## Warning: NAs introduced by coercion`

```
# Same conversion for BMI: text labels -> numbers, with anything that is not
# a plain number turning into NA.
Sanford_data[["BMI"]] <- as.numeric(as.character(Sanford_data[["BMI"]]))
```

`## Warning: NAs introduced by coercion`

```
# Rows whose Status is "Deceased", keeping only the listed columns (in the
# order they appear in Sanford_data). %in% treats a missing Status as
# "no match", so such rows are left out.
Sanford_dead <- Sanford_data[
  Sanford_data$Status %in% "Deceased",
  names(Sanford_data) %in% c("ID", "Sex", "Age", "BMI", "Diabetes", "SmokingStatus")
]
```

```
# Pearson correlation between age and smoking-status code in the deceased
# subset.
# NOTE(review): SmokingStatus is stored as an integer code -- confirm it is
# ordinal before interpreting this coefficient.
cor.test(
  x = Sanford_dead$Age,
  y = Sanford_dead$SmokingStatus,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: Sanford_dead$Age and Sanford_dead$SmokingStatus
## t = 11.734, df = 2668, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1851462 0.2572961
## sample estimates:
## cor
## 0.2215243
```

```
# Pearson correlation between age and BMI in the deceased subset.
cor.test(
  x = Sanford_dead$Age,
  y = Sanford_dead$BMI,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: Sanford_dead$Age and Sanford_dead$BMI
## t = -0.8219, df = 2144, p-value = 0.4112
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06001594 0.02458419
## sample estimates:
## cor
## -0.01774764
```

```
# Pearson correlation between BMI and smoking-status code in the deceased
# subset.
cor.test(
  x = Sanford_dead$BMI,
  y = Sanford_dead$SmokingStatus,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: Sanford_dead$BMI and Sanford_dead$SmokingStatus
## t = -0.14203, df = 2734, p-value = 0.8871
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04018583 0.03476072
## sample estimates:
## cor
## -0.002716367
```

```
# Pearson correlation between BMI and smoking-status code over the full data
# set (all Status values).
cor.test(
  x = Sanford_data$BMI,
  y = Sanford_data$SmokingStatus,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: Sanford_data$BMI and Sanford_data$SmokingStatus
## t = 2.52, df = 144900, p-value = 0.01174
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.001471188 0.011768433
## sample estimates:
## cor
## 0.006619986
```

```
# Figure 2.0: jittered scatter of age against smoking-status code for the
# deceased subset.
# Columns are named bare inside aes() so ggplot2 looks them up in the `data`
# argument; writing Sanford_dead$... there bypasses that lookup and is
# discouraged by ggplot2.
ggplot(Sanford_dead, aes(x = SmokingStatus, y = Age)) +
  geom_jitter(color = "midnightblue", size = 0.75) +
  theme_bw() +
  labs(
    title = "Figure 2.0: Age at Death and Smoking Status of Dead Individuals in the SDC",
    x = "Smoking Status",
    y = "Age at Death"
  )
```

`## Warning: Removed 993 rows containing missing values (geom_point).`

```
# Rows whose Status is "Alive", keeping only the listed columns. %in% treats
# a missing Status as "no match", so such rows are left out.
Sanford_live <- Sanford_data[
  Sanford_data$Status %in% "Alive",
  names(Sanford_data) %in% c("ID", "Sex", "Age", "BMI", "Diabetes", "SmokingStatus")
]
# Pearson correlation between age and smoking-status code in the living subset.
cor.test(
  x = Sanford_live$Age,
  y = Sanford_live$SmokingStatus,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: Sanford_live$Age and Sanford_live$SmokingStatus
## t = 45.696, df = 145770, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1137748 0.1238968
## sample estimates:
## cor
## 0.1188389
```

```
# Figure 3.0: histogram of age (one-year bins) over the full data set.
# Age is named bare inside aes() so ggplot2 takes it from the `data` argument
# rather than from the global data frame.
ggplot(Sanford_data, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.0: Frequency of Age of Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
```

`## Warning: Removed 6702 rows containing non-finite values (stat_bin).`

```
# Figure 3.1: histogram of age (one-year bins) for the deceased subset.
# Age is named bare inside aes() so ggplot2 takes it from the `data` argument
# rather than from the global data frame.
ggplot(Sanford_dead, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.1: Frequency of Age of Deceased Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
```

`## Warning: Removed 993 rows containing non-finite values (stat_bin).`

```
# Figure 3.2: histogram of age (one-year bins) for the living subset.
# Age is named bare inside aes() so ggplot2 takes it from the `data` argument
# rather than from the global data frame.
ggplot(Sanford_live, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.2: Frequency of Age of Living Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
```

`## Warning: Removed 5709 rows containing non-finite values (stat_bin).`

```
# Approximate each living individual's birth year from their age.
# NOTE(review): 2018 is hard-coded -- presumably the year the data were
# collected; confirm.
Sanford_live$YearBorn <- 2018 - Sanford_live$Age

# Figure 3.3: histogram of birth year (one-year bins) for the living subset.
# The x axis plots YearBorn, so it is labelled "Year of Birth" (it previously
# read "Age"). YearBorn is named bare inside aes() so ggplot2 takes it from
# the `data` argument.
ggplot(Sanford_live, aes(x = YearBorn)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.3: Frequency of Year of Birth of Living Individuals in SDC",
    x = "Year of Birth",
    y = "Frequency"
  )
```

`## Warning: Removed 5709 rows containing non-finite values (stat_bin).`

```
# Mean smoking-status code per birth year in the living subset. aggregate()
# returns one row per distinct YearBorn value; mean() is called without
# na.rm, so a missing SmokingStatus in a group would make that group's mean NA.
SmokingHistory <- aggregate(
  Sanford_live$SmokingStatus,
  by = list(Sanford_live$YearBorn),
  FUN = mean
)
names(SmokingHistory) <- c("YearBorn", "AverageSmokingStatus")

# Figure 4.0: line plot of the per-year means. Columns are named bare inside
# aes() so ggplot2 takes them from the `data` argument.
ggplot(SmokingHistory, aes(x = YearBorn, y = AverageSmokingStatus)) +
  geom_line(color = "goldenrod4", size = 1) +
  theme_bw() +
  labs(
    title = "Figure 4.0: Average Smoking Status of Individuals Born from 1929 to 2000",
    x = "Year of Birth",
    y = "Average Smoking Status"
  )
```

```
# Pearson correlation between birth year and the per-year mean smoking-status
# code. Each birth year contributes one point regardless of how many
# individuals it represents.
cor.test(
  x = SmokingHistory$YearBorn,
  y = SmokingHistory$AverageSmokingStatus,
  method = "pearson"
)
```

```
##
## Pearson's product-moment correlation
##
## data: SmokingHistory$YearBorn and SmokingHistory$AverageSmokingStatus
## t = -3.3678, df = 70, p-value = 0.001235
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5569019 -0.1551636
## sample estimates:
## cor
## -0.3734094
```