## RMD Code

#### Chunk 1: Initialized Database

library(tidyverse)
## -- Attaching packages ---------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ------------------------------- tidyverse_conflicts() --
## x dplyr::lag()    masks stats::lag()
# Read the Sanford teaching data set from an explicit path instead of calling
# setwd(), so the script does not change the session's working directory as a
# side effect.
# NOTE(review): this directory is machine-specific; point it at your local copy.
sanford_dir <- "C:/Users/trevo/OneDrive/Documents/COSC 219/sanford"
Sanford_data <- read.csv(
  file.path(sanford_dir, "Sanford_Data_Collaborative_Teaching_DataSet.csv")
)

#### Chunk 2: Modifying data. Converted Age and BMI from factors to numeric. SmokingStatus was already numeric. Created a data set of all deceased people, labelled Sanford_dead.

# Inspect the column types as loaded (recorded output follows).
str(Sanford_data)
## 'data.frame':    155143 obs. of  15 variables:
##  $ ID                   : int  2 8 9 11 12 16 20 21 27 39 ...
##  $ Sex                  : Factor w/ 3 levels "Female","Male",..: 2 1 1 2 2 2 1 1 2 2 ...
##  $ Age                  : Factor w/ 73 levels "18","19","20",..: 65 33 43 42 48 35 56 42 22 21 ...
##  $ Status               : Factor w/ 2 levels "Alive","Deceased": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Hypertension         : int  0 0 1 1 1 1 1 1 1 0 ...
##  $ VascularDisease      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Payor                : Factor w/ 3 levels "Medicaid","Medicare",..: 2 3 3 3 3 3 2 3 3 3 ...
##  $ Diabetes             : int  0 0 1 1 0 1 0 0 0 0 ...
##  $ A1C                  : Factor w/ 129 levels "10","10.1","10.2",..: 129 129 89 105 129 110 129 129 129 129 ...
##  $ BMI                  : Factor w/ 4812 levels "1.24","10.12",..: 1120 1707 1658 1810 2029 1885 2150 2056 4515 3232 ...
##  $ ScheduledClinicVisits: Factor w/ 96 levels "0","1","10","102",..: 43 32 85 54 76 21 2 76 43 43 ...
##  $ MissedClinicVisits   : Factor w/ 26 levels "0","1","10","11",..: 1 1 1 1 2 1 1 1 1 2 ...
##  $ DiastolicBP          : int  72 91 78 79 82 69 64 82 86 73 ...
##  $ SystolicBP           : int  139 150 122 116 117 149 131 145 141 134 ...
##  $ SmokingStatus        : int  4 5 5 4 4 1 5 5 5 5 ...

# Age and BMI were read in as factors. Go through as.character() first:
# as.numeric() applied directly to a factor returns its integer level codes,
# not the values. Entries that are not parseable as numbers become NA, which
# is what the recorded coercion warnings report.
Sanford_data$Age <- as.numeric(as.character(Sanford_data$Age))
## Warning: NAs introduced by coercion
Sanford_data$BMI <- as.numeric(as.character(Sanford_data$BMI))
## Warning: NAs introduced by coercion

# Deceased individuals only, restricted to the columns used in later chunks.
# which() keeps only rows where the comparison is TRUE (rows with NA Status
# are dropped rather than turned into all-NA rows).
Sanford_dead <- Sanford_data[
  which(Sanford_data$Status == "Deceased"),
  names(Sanford_data) %in% c("ID", "Sex", "Age", "BMI", "Diabetes", "SmokingStatus")
]

#### Chunk 3: Running correlations between BMI, Age, and Smoking Status in Sanford_dead

# Pearson correlation between Age and SmokingStatus within Sanford_dead.
cor.test(
  x = Sanford_dead$Age,
  y = Sanford_dead$SmokingStatus,
  method = "pearson"
)
##
##  Pearson's product-moment correlation
##
## data:  Sanford_dead$Age and Sanford_dead$SmokingStatus
## t = 11.734, df = 2668, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1851462 0.2572961
## sample estimates:
##       cor
## 0.2215243
# Pearson correlation between Age and BMI within Sanford_dead.
cor.test(
  x = Sanford_dead$Age,
  y = Sanford_dead$BMI,
  method = "pearson"
)
##
##  Pearson's product-moment correlation
##
## data:  Sanford_dead$Age and Sanford_dead$BMI
## t = -0.8219, df = 2144, p-value = 0.4112
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06001594  0.02458419
## sample estimates:
##         cor
## -0.01774764
# Pearson correlation between BMI and SmokingStatus within Sanford_dead.
cor.test(
  x = Sanford_dead$BMI,
  y = Sanford_dead$SmokingStatus,
  method = "pearson"
)
##
##  Pearson's product-moment correlation
##
## data:  Sanford_dead$BMI and Sanford_dead$SmokingStatus
## t = -0.14203, df = 2734, p-value = 0.8871
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04018583  0.03476072
## sample estimates:
##          cor
## -0.002716367

#### Chunk 4: Testing for a correlation between SmokingStatus and BMI among the entire population.

# Pearson correlation between BMI and SmokingStatus over the full data set.
cor.test(
  x = Sanford_data$BMI,
  y = Sanford_data$SmokingStatus,
  method = "pearson"
)
##
##  Pearson's product-moment correlation
##
## data:  Sanford_data$BMI and Sanford_data$SmokingStatus
## t = 2.52, df = 144900, p-value = 0.01174
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.001471188 0.011768433
## sample estimates:
##         cor
## 0.006619986

#### Chunk 5: Jitter Plot created to show SmokingStatus versus Age among dead individuals in the SDC.

# Figure 2.0: jittered scatter of SmokingStatus (x) against Age (y) for the
# rows of Sanford_dead.
# Columns are named bare inside aes() so ggplot2 looks them up in the `data`
# argument; passing `Sanford_dead$col` hands it a detached vector instead,
# which ggplot2 discourages. Axis titles are set explicitly in labs(), so the
# rendered figure is unchanged.
ggplot(Sanford_dead, aes(x = SmokingStatus, y = Age)) +
  geom_jitter(color = "midnightblue", size = 0.75) +
  theme_bw() +
  labs(
    title = "Figure 2.0: Age at Death and Smoking Status of Dead Individuals in the SDC",
    x = "Smoking Status",
    y = "Age at Death"
  )
## Warning: Removed 993 rows containing missing values (geom_point).

# Living individuals only, with the same column selection used for
# Sanford_dead. which() keeps only rows where the comparison is TRUE.
# NOTE(review): chunk numbering jumps from 5 to 7 here; a "Chunk 6" heading
# appears to be missing from this file.
Sanford_live <- Sanford_data[
  which(Sanford_data$Status == "Alive"),
  names(Sanford_data) %in% c("ID", "Sex", "Age", "BMI", "Diabetes", "SmokingStatus")
]

# Pearson correlation between Age and SmokingStatus within Sanford_live.
cor.test(x = Sanford_live$Age, y = Sanford_live$SmokingStatus, method = "pearson")
##
##  Pearson's product-moment correlation
##
## data:  Sanford_live$Age and Sanford_live$SmokingStatus
## t = 45.696, df = 145770, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1137748 0.1238968
## sample estimates:
##       cor
## 0.1188389

#### Chunk 7: Created histograms displaying the age distribution among living, deceased, and all members of the SDC.

# Figure 3.0: histogram of Age over the full data set, one bin per year.
# Age is named bare inside aes() so it is resolved from the `data` argument.
ggplot(Sanford_data, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.0: Frequency of Age of Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
## Warning: Removed 6702 rows containing non-finite values (stat_bin).

# Figure 3.1: histogram of Age within Sanford_dead, one bin per year.
# Age is named bare inside aes() so it is resolved from the `data` argument.
ggplot(Sanford_dead, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.1: Frequency of Age of Deceased Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
## Warning: Removed 993 rows containing non-finite values (stat_bin).

# Figure 3.2: histogram of Age within Sanford_live, one bin per year.
ggplot(Sanford_live, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.2: Frequency of Age of Living Individuals in SDC",
    x = "Age",
    y = "Frequency"
  )
## Warning: Removed 5709 rows containing non-finite values (stat_bin).

#### Chunk 8: A Histogram of YearBorn is just a mirror image of Age in the living population.

# Derive a birth year from Age using a fixed reference year of 2018.
# NOTE(review): 2018 is hard-coded; presumably the year the data were
# collected -- confirm before reusing with other extracts.
Sanford_live$YearBorn <- 2018 - Sanford_live$Age

# Figure 3.3: histogram of YearBorn within Sanford_live, one bin per year.
# The x axis shows YearBorn, so it is labelled as year of birth, not age.
ggplot(Sanford_live, aes(x = YearBorn)) +
  geom_histogram(binwidth = 1, fill = "seashell4") +
  theme_bw() +
  labs(
    title = "Figure 3.3: Frequency of Year of Birth of Living Individuals in SDC",
    x = "Year of Birth",
    y = "Frequency"
  )
## Warning: Removed 5709 rows containing non-finite values (stat_bin).

#### Chunk 9: Created dataset SmokingHistory showing year born and average smoking status. Line graph showing year born versus average smoking status among the living population.

# One row per YearBorn value holding the mean SmokingStatus of that group.
# aggregate() names its result columns Group.1 and x, so they are renamed.
SmokingHistory <- aggregate(
  Sanford_live$SmokingStatus,
  by = list(Sanford_live$YearBorn),
  FUN = mean
)
names(SmokingHistory) <- c("YearBorn", "AverageSmokingStatus")

# Figure 4.0: line plot of mean SmokingStatus against YearBorn.
# Columns are named bare inside aes() so they are resolved from `data`.
ggplot(SmokingHistory, aes(x = YearBorn, y = AverageSmokingStatus)) +
  geom_line(color = "goldenrod4", size = 1) +
  theme_bw() +
  labs(
    title = "Figure 4.0: Average Smoking Status of Individuals Born from 1929 to 2000",
    x = "Year of Birth",
    y = "Average Smoking Status"
  )

# Pearson correlation between YearBorn and the per-year mean SmokingStatus.
cor.test(
  x = SmokingHistory$YearBorn,
  y = SmokingHistory$AverageSmokingStatus,
  method = "pearson"
)
##
##  Pearson's product-moment correlation
##
## data:  SmokingHistory$YearBorn and SmokingHistory$AverageSmokingStatus
## t = -3.3678, df = 70, p-value = 0.001235
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5569019 -0.1551636
## sample estimates:
##        cor
## -0.3734094