knitr::opts_chunk$set(comment = NA)
library(Hmisc)
library(janitor)
library(naniar)
library(sessioninfo)
library(kableExtra)
library(broom)
library(ggridges)
library(patchwork)
library(gt); library(gtExtras)
library(car)
library(mosaic)
library(tidyverse)
theme_set(theme_bw())6 Diabetes Prevalence and Social Associations
Among 441 counties from six states (California, Colorado, Indiana, Mississippi, New York, and Ohio)
7 R Packages
8 Data Ingest
data_url <-
"https://www.countyhealthrankings.org/sites/default/files/media/document/analytic_data2023_0.csv"
chr_2023_raw <- read_csv(data_url, skip = 1, guess_max = 4000,
show_col_types = FALSE)
chr_2023_raw <- chr_2023_raw |>
filter(county_ranked == 1)
dim(chr_2023_raw)[1] 3082 720
9 State Selection
We chose states from the four main regions of the US (north, south, east, and west) with a range of 58- 92 counties ( total of 441 counties) to produce a fairly representative sample with a homogeneous distribution.
chr_2023 <- chr_2023_raw |>
filter(state %in% c("CA", "CO", "IN", "MS", "NY", "OH")) |>
mutate(state = factor(state))
dim(chr_2023)[1] 441 720
chr_2023 |> count(state)# A tibble: 6 × 2
state n
<fct> <int>
1 CA 58
2 CO 59
3 IN 92
4 MS 82
5 NY 62
6 OH 88
Our study sample ‘chr_2023’ includes 6 states with a total of 441 counties.
describe(chr_2023$county)chr_2023$county
n missing distinct
441 0 349
lowest : Adams County Alameda County Alamosa County Albany County Alcorn County
highest: Yates County Yazoo County Yolo County Yuba County Yuma County
10 Variable Selection
The variables selected are Diabetes Prevalence v060, High School Completion v168, Insufficient Sleep v143, Premature Age-Adjusted Mortality v127, and Social Associations v140.
chr_2023 <- chr_2023 |>
select(fipscode, state, county, county_ranked,
v060_rawvalue, v140_rawvalue, v168_rawvalue,
v143_rawvalue, v127_rawvalue)Here, we will multiply the three variables v060, v168, and v143 that describe proportions by 100 to obtain percentages, which are easier to understand.
chr_2023 <- chr_2023 |>
mutate(Diabet_Prev = 100*v060_rawvalue,
Highschl_complet = 100*v168_rawvalue,
insuff_sleep = 100*v143_rawvalue,
.keep = "unused") |>
rename(adj_pre_mort = v127_rawvalue,
membershp_assocn = v140_rawvalue)Now we will check our variables.
11 Variable Cleaning and Renaming
| Initial Name | New Name | Role | Description |
|---|---|---|---|
v060_rawvalue |
Diabet_prev |
A1 outcome | Diabetes Prevalence: Percentage of adults aged 20 and above with diagnosed diabetes (age-adjusted) |
v140_rawvalue |
membershp_assocn |
A1 predictor | Social Associations: The count of membership associations for every 10,000 people in the population. |
v127_rawvalue |
adj_pre_mort |
A2 outcome | Premature Age-Adjusted Mortality: Number of deaths among residents under age 75 per 100,000 population (age-adjusted) |
v168_rawvalue |
Highschl_complet |
A2 predictor | High School Completion: Percentage of adults aged 25 and older who have completed high school or obtained an equivalent qualification. |
v143_rawvalue |
insuff_sleep |
A3 outcome | Insufficient Sleep: % of adults who report fewer than 7 hours of sleep on average (age-adjusted) in 2018. |
At this stage we have 441 counties and 9 variables.
dim(chr_2023)[1] 441 9
Our 9 variables are presented by this code:
names(chr_2023)[1] "fipscode" "state" "county" "county_ranked"
[5] "membershp_assocn" "adj_pre_mort" "Diabet_Prev" "Highschl_complet"
[9] "insuff_sleep"
12 Creating the Analysis 2 Predictor
To establish our cut-points, we should look at the 40th and 60th percentiles of the existing data for our planned predictor for Analysis 2, which is Highschl_complet.
chr_2023 |>
summarise(q40 = quantile(Highschl_complet, c(0.4)),
q60 = quantile(Highschl_complet, c(0.6)))# A tibble: 1 × 2
q40 q60
<dbl> <dbl>
1 88.2 90.1
So we will create a three-level variable where values of 88.2 and lower will fall in the Low group, and values of 90.1 and higher will fall in the High group.
chr_2023 <- chr_2023 |>
mutate(Highschl_complet_grp = case_when(
Highschl_complet <= 88.2 ~ "Low",
Highschl_complet >= 90.1 ~ "High")) |>
mutate(Highschl_complet_grp = factor(Highschl_complet_grp))
chr_2023 |> count(Highschl_complet_grp)# A tibble: 3 × 2
Highschl_complet_grp n
<fct> <int>
1 High 175
2 Low 176
3 <NA> 90
Here we have about 40% of our subjects in the High group and about 40% in the Low group, with the rest now listed as missing, and the Highschl_complet_grp variable is now a factor, so that should be acceptable.
13 Adding 2018 Data for the Analysis 3 Outcome
We will add data from 2018, which is five years prior to the 2023 report.
To do so, we imported a file, called projA_chr_2018_raw.csv that contains the FIPS code and other values for each of the 3,078 counties ranked in 2018.
chr_2018_raw <- read_csv("431-projA-chr_2018.csv",
guess_max = 4000,
show_col_types = FALSE)
chr_2018 <- chr_2018_raw |> mutate(fipscode = as.character(fipscode))Now, we will create a tibble called chr_2018 containing just two variables: the fipscode and the variable we are using as our Analysis 3 outcome v143_rawvalue.
chr_2018 <- chr_2018_raw |>
select(fipscode, v143_rawvalue)we will convert the v143_rawvalue which is a proportion into a percentage to keep it consistent with the rest of the variables.
chr_2018 <- chr_2018 |>
mutate(insuff_sleep2 = 100*v143_rawvalue,
.keep = "unused")Here we will convert fipscode variable from numeric to character from both chr_2023 and chr_2018 data sets.
chr_2023$fipscode <- as.character(as.numeric(chr_2023$fipscode))
chr_2018$fipscode <- as.character(as.numeric(chr_2018$fipscode))Now, We will join the two files.
chr_2023 <- left_join(chr_2023, chr_2018, by = "fipscode")We need to rename the two variables which deal with our Analysis 3 outcome.
chr_2023 <- chr_2023 |>
rename(insuff_sleep_2023 = insuff_sleep,
insuff_sleep_2018 = insuff_sleep2)14 Arranging and Saving the Analytic Tibble
chr_2023 <- chr_2023 |>
select(fipscode, state, county,
Diabet_Prev, membershp_assocn,
adj_pre_mort, Highschl_complet_grp,
Highschl_complet,
insuff_sleep_2023, insuff_sleep_2018,
county_ranked)
write_rds(chr_2023, file = "FINAL_CHR2023_AP_SarahShamita.Rds")15 Print the Tibble
chr_2023# A tibble: 441 × 11
fipscode state county Diabet_Prev membershp_assocn adj_pre_mort
<chr> <fct> <chr> <dbl> <dbl> <dbl>
1 6001 CA Alameda County 9.1 6.82 239.
2 6003 CA Alpine County 8.8 0 992.
3 6005 CA Amador County 8.5 7.23 327.
4 6007 CA Butte County 9.4 8.08 408.
5 6009 CA Calaveras County 8.7 8.64 358.
6 6011 CA Colusa County 11.4 5.10 333.
7 6013 CA Contra Costa County 8.8 5.55 244.
8 6015 CA Del Norte County 10.6 3.58 466.
9 6017 CA El Dorado County 7.7 5.91 248.
10 6019 CA Fresno County 12.2 5.20 379.
# ℹ 431 more rows
# ℹ 5 more variables: Highschl_complet_grp <fct>, Highschl_complet <dbl>,
# insuff_sleep_2023 <dbl>, insuff_sleep_2018 <dbl>, county_ranked <dbl>
16 Numerical Summaries
describe(chr_2023)chr_2023
11 Variables 441 Observations
--------------------------------------------------------------------------------
fipscode
n missing distinct
441 0 441
lowest : 18001 18003 18005 18007 18009, highest: 8117 8119 8121 8123 8125
--------------------------------------------------------------------------------
state
n missing distinct
441 0 6
Value CA CO IN MS NY OH
Frequency 58 59 92 82 62 88
Proportion 0.132 0.134 0.209 0.186 0.141 0.200
--------------------------------------------------------------------------------
county
n missing distinct
441 0 349
lowest : Adams County Alameda County Alamosa County Albany County Alcorn County
highest: Yates County Yazoo County Yolo County Yuba County Yuma County
--------------------------------------------------------------------------------
Diabet_Prev
n missing distinct Info Mean pMedian Gmd .05
441 0 99 1 10.29 10.1 2.364 7.2
.10 .25 .50 .75 .90 .95
8.0 8.8 10.0 11.2 13.0 14.6
lowest : 5.6 5.9 6.2 6.4 6.5 , highest: 17.5 17.7 18.3 19.2 19.5
--------------------------------------------------------------------------------
membershp_assocn
n missing distinct Info Mean pMedian Gmd .05
441 0 433 1 10.48 10.49 4.408 4.288
.10 .25 .50 .75 .90 .95
5.593 8.008 10.453 13.073 15.275 16.504
lowest : 0 2.82247 2.84768 3.08356 3.19421
highest: 19.2308 19.5596 20.1808 22.2019 30.8702
--------------------------------------------------------------------------------
adj_pre_mort
n missing distinct Info Mean pMedian Gmd .05
441 0 441 1 422.8 414 145.8 233.7
.10 .25 .50 .75 .90 .95
259.4 338.7 402.2 491.6 588.4 652.9
lowest : 133.412 149.925 156.613 158.992 174.725
highest: 827.965 836.174 837.896 853.055 991.591
--------------------------------------------------------------------------------
Highschl_complet_grp
n missing distinct
351 90 2
Value High Low
Frequency 175 176
Proportion 0.499 0.501
--------------------------------------------------------------------------------
Highschl_complet
n missing distinct Info Mean pMedian Gmd .05
441 0 441 1 87.94 88.59 6.054 77.58
.10 .25 .50 .75 .90 .95
80.38 85.17 89.07 91.46 93.70 94.96
lowest : 56.5591 59.5314 59.5871 70.3531 70.4399
highest: 97.8756 97.9582 98.2858 98.2871 98.6202
--------------------------------------------------------------------------------
insuff_sleep_2023
n missing distinct Info Mean pMedian Gmd .05
441 0 134 1 34.44 34.6 3.707 27.9
.10 .25 .50 .75 .90 .95
29.8 32.6 34.7 36.7 38.3 39.5
lowest : 23.8 24.7 25.5 25.9 26 , highest: 41.4 41.5 41.7 41.8 43.3
--------------------------------------------------------------------------------
insuff_sleep_2018
n missing distinct Info Mean pMedian Gmd .05
438 3 438 1 33.87 34.11 4.325 26.13
.10 .25 .50 .75 .90 .95
28.03 31.81 34.30 36.37 38.47 39.72
lowest : 23.0283 23.2892 23.4431 23.5805 24.4067
highest: 41.671 41.8668 42.4816 42.5945 42.7835
--------------------------------------------------------------------------------
county_ranked
n missing distinct Info Mean
441 0 1 0 1
Value 1
Frequency 441
Proportion 1
--------------------------------------------------------------------------------
17 The Codebook
Our chr_2023 tibble contains 441 counties and 11 variables.
| Variable | Original | Role | NA | Distinct | Definition | Year | Source |
|---|---|---|---|---|---|---|---|
| fipscode | – | ID | 0 | 441 | county’s FIPS code | NA | NA |
| state | – | ID | 0 | 6 | state postal abbreviation | NA | NA |
| county | – | ID | 0 | 349 | county name | NA | NA |
| Diabet_Prev | v060 |
A1 out | 0 | 99 | Diabetes prevalence: Percentage of adults aged 20 and above with diagnosed diabetes (age-adjusted) | 2020 | Behavioral Risk Factor Surveillance System (BRFSS) |
| membershp_assocn | v140 |
A1 pre | 0 | 433 | Social associations: The count of membership associations for every 10,000 people in the population. | 2020 | County Business Patterns |
| adj_pre_mort | v127 |
A2 out | 0 | 441 | Premature Age-Adjusted Mortality: Number of deaths among residents under age 75 per 100,000 population (age-adjusted) | 2018- 2020 | National Center for Health Statistics - Mortality Files |
| Highschl_complet_grp | v168 |
A2 pre | 90 | 2 | High school completion: The percentage of adults aged 25 and older who have completed high school or obtained an equivalent qualification. ( Low is \(\leq\) 88.2%, High is \(\geq\) 90.1% ) | 2017- 2021 | American Community Survey, 5-year estimates |
| Highschl_complet | v168 |
Var 4 | 0 | 441 | Quantitative version of high school completion | 2017- 2021 | American Community Survey, 5-year estimates |
| insuff_sleep_2023 | v143 |
A3 (2023) | 0 | 134 | Insufficient sleep: Percentage of adults who report fewer than 7 hours of sleep on average (age-adjusted) in 2023 | 2020 | Behavioral Risk Factor Surveillance System (BRFSS) |
| insuff_sleep_2018 | v143 |
A3 (2018) | 3 | 438 | Insufficient sleep: Percentage of adults who report fewer than 7 hours of sleep on average (age-adjusted) in 2018 | 2016 | Behavioral Risk Factor Surveillance System (BRFSS) |
| county_ranked | – | Check | 0 | 1 | all values are 1 | NA | NA |
Missingness Check We have no quantitative variables missing more than 3 of our 441 counties (0.6%) which is less than Project A’s limit of 20%.
Distinct Values Check We have at least 99 distinct values in each of our quantitative variables, which is much larger than the minimum count (15) for Project A.
18 Research Questions
18.1 Analysis 1 Research Question
Does increased number of membership associations correlate with lower diabetes prevalence among adults in 441 counties across California, Colorado, Indiana, Mississippi, New York, and Ohio?
The interpersonal and social environments significantly influence an individual’s likelihood of developing chronic diseases, including diabetes. This analysis aims to examine the potential inverse relationship between the number of social associations (measured as membership associations per 10,000 people) and diabetes prevalence among adults in the US population.
18.2 Analysis 2 Research Question
Can we predict meaningful differences in premature age-adjusted mortality, which measures the number of deaths among residents under age 75 per 100,000 population, based on two levels (high and low) of secondary education in 441 counties across California, Colorado, Indiana, Mississippi, New York, and Ohio?
This analysis seeks to determine whether higher levels of secondary education in counties are associated with lower premature mortality rates, potentially providing a tool to address disparities in US populations’ health.
18.3 Analysis 3 Research Question
Are there meaningful differences in the quantity of sleep between the time periods before COVID-19 pandemic (2018) and after COVID-19 (2023) for 441 counties in the states of California, Colorado, Indiana, Mississippi, New York, and Ohio?
COVID-19 caused world-wide changes in health behaviors across various levels. This analysis aims to explore possible shifts in health behaviors (specifically sleep insufficiency) across the US population before the pandemic (2018) and after the pandemic (2023).
19 Analysis 1
19.1 variables
Our research sample comprises 441 counties from six states (California, Colorado, Indiana, Mississippi, New York, and Ohio). The first research question investigates the possible impact of membership associations on diabetes prevalence among these counties.
The outcome is ‘Diabetes Prevalence’ which is the percentage of adults above 20 years of age diagnosed with diabetes and the predictor is ‘membership association’ which is the number of membership associations for every 10,000 people in the population.
Number of counties with complete data on outcome and predictor variables:
Analysis_1 <- chr_2023 |>
filter(complete.cases(chr_2023)) |>
select(membershp_assocn, Diabet_Prev, state, county)
dim(Analysis_1)[1] 348 4
We have 348 counties with complete data on both membership association and diabetes prevalence.
The values of the outcome and predictor for Cuyahoga County, in Ohio:
Analysis_1_Cuyahoga <- chr_2023 |>
filter(state == "OH") |>
filter(county == "Cuyahoga County")|>
select(Diabet_Prev, membershp_assocn)
tibble::glimpse(Analysis_1_Cuyahoga)Rows: 1
Columns: 2
$ Diabet_Prev <dbl> 11.3
$ membershp_assocn <dbl> 9.129534
In Cuyahoga county, the prevalence of diabetes is 11.3 % and the the number of membership associations is 9.13 for every 10,000 individual in the population.
19.2 Summaries
19.2.1 Graphical Summaries of the Outcome-Predictor Association
We will start by exploring our data set before any transformations.
ggplot(data = Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(shape = 1, size = 2, col = "grey50") +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "red") +
geom_smooth(method = "lm", formula = y ~ x,
se = FALSE, col = "steelblue") +
labs(title = "Number of membership associations vs diabetes prevalence in CHR 2023 data",
subtitle = "with fitted straight line regression model",
x = "Membership association numbers per 10,000", y = "Diabetes Prevalence %",
caption = str_glue("Pearson correlation is ",
round_half_up(cor(Analysis_1$Diabet_Prev, Analysis_1$membershp_assocn),3)))The relationship between the number of membership associations per 10,000 individuals and the prevalence of diabetes is slightly positive. The relationship is not perfectly linear nor are the values strongly clustered around the fitted line. This observation is reflected by the low pearson correlation (0.063). The loes smooth shows a strong indication that the data is non linear. We can also observe many outliers scattered around the plot which indicates that many counties could not be fit in this simple linear model.
Analysis_1temp <- Analysis_1 |>
filter(membershp_assocn > 20 & Diabet_Prev < 10)
Analysis_1temp2 <- Analysis_1 |>
filter( Diabet_Prev > 18)
ggplot(data = Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(shape = 1, size = 2, col = "grey50") +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", formula = y ~ x,
se = FALSE, col = "red") +
geom_point(data = Analysis_1temp, size = 2, col = "hotpink") +
geom_point(data = Analysis_1temp2, size = 2, col = "hotpink") +
labs(title = "Number of membership associations vs diabetes prevalence in CHR 2023 data",
subtitle = "with fitted straight line regression model",
x = "Membership association numbers per 10,000", y = "Diabetes Prevalence %",
caption = str_glue("Pearson correlation is ",
round_half_up(cor(Analysis_1$Diabet_Prev, Analysis_1$membershp_assocn),3)))This graph shows some of the problematic outliers in this data set.
Now we will observe any improvements in the fitting of our model after transforming the outcome using three different transformations.
First we will try the square root transformation:
p1 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(col = "gray50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Original: Diabetes vs. Memberships")
p2 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = sqrt(Diabet_Prev))) +
geom_point(col = "grey50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Square Root of Diabetes vs. Memberships")
p1 + p2Second, we will try the logarithm transformation.
p1 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(col = "grey50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Original: Diabetes vs. Memberships")
p2 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = log(Diabet_Prev))) +
geom_point(col = "grey50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Log of Diabetes vs. Memberships")
p1 + p2Finally, we will try the inverse transformation.
p1 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(col = "grey50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Original: Diabetes vs. Memberships")
p2 <- ggplot(Analysis_1, aes(x = membershp_assocn, y = 1/Diabet_Prev)) +
geom_point(col = "grey50") +
geom_smooth(method = "loess", se = FALSE,
formula = y ~ x, col = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE,
formula = y ~ x, col = "red", linetype = "dashed") +
labs(title = "Inverse of Diabetes vs. Memberships")
p1 + p2Unfortunately, none of the transformations was successful in producing a better fit for our simple linear regression. We will use the our original linear model without transformations.
19.2.2 Graphical summary of the outcome distribution
We will assess the distribution of our outcome (Diabetes Prevalence) which is the variable (Diabet_Prev) using three plots: a normal Q-Q plot, a histogram, and a boxplot.
## Normal Q-Q plot
p1 <- ggplot(Analysis_1, aes(sample = Diabet_Prev)) +
geom_qq(col = "steelblue", size = 2) + geom_qq_line(col = "black") +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q plot",
y = "Diabetes Prevalence",
x = "Expectation under Standard Normal")
## Histogram with Normal density superimposed
p2 <- ggplot(Analysis_1, aes(Diabet_Prev)) +
geom_histogram(aes(y = after_stat(density)),
bins = 7, fill = "steelblue", col = "lightgray") +
stat_function(fun = dnorm,
args = list(mean = mean(Analysis_1$Diabet_Prev, na.rm = TRUE),
sd = sd(Analysis_1$Diabet_Prev, na.rm = TRUE)),
col = "darkgray", lwd = 1.5) +
labs(title = "Histogram with Normal Density",
x = "Diabetes Prevalence")
## Boxplot with notch and rug
p3 <- ggplot(Analysis_1, aes(x = Diabet_Prev, y = "")) +
geom_boxplot(fill = "steelblue", notch = TRUE,
outlier.color = "steelblue", outlier.size = 2) +
stat_summary(fun = "mean", geom = "point",
shape = 23, size = 3, fill = "lightgray") +
geom_rug(sides = "b") +
labs(title = "Boxplot with Notch and Rug",
x = "Diabetes Prevalence",
y = "")
p1 + (p2 / p3 + plot_layout(heights = c(4,1))) +
plot_annotation(title = "Diabetes Prevalence Among Adults",
subtitle = str_glue("in counties from 6 states: (n = ",
nrow(Analysis_1), ")"),
caption = str_glue("Diabetes Prevalence: Sample Size = ", nrow(Analysis_1),
", Sample Median = ", round_half_up(median(Analysis_1$Diabet_Prev),1),
", Mean = ", round_half_up(mean(Analysis_1$Diabet_Prev),2),
" and SD = ", round_half_up(sd(Analysis_1$Diabet_Prev),2)))We can safely conclude that the outcome distribution is not normal but skewed (right skew) due to many outliers (counties with remarkably high prevalence of Diabetes among adults compared to the sample).
The counties that have been identified as outliers with the highest prevalence of diabetes:
Analysis_1 |> arrange(desc(Diabet_Prev)) |> head(5)# A tibble: 5 × 4
membershp_assocn Diabet_Prev state county
<dbl> <dbl> <fct> <chr>
1 10.2 19.5 MS Holmes County
2 12.8 19.2 MS Humphreys County
3 5.61 18.3 MS Claiborne County
4 8.88 17.7 MS Quitman County
5 10.7 17.5 MS Coahoma County
19.2.3 Numerical Summaries
key1 <-
bind_rows(
favstats(~ Diabet_Prev, data = Analysis_1),
favstats(~ membershp_assocn, data = Analysis_1)) |>
mutate(variable = c("Diabet_Prev", "membershp_assocn")) |>
relocate(variable)
key1 |> kbl(digits = 3) |> kable_styling()| variable | min | Q1 | median | Q3 | max | mean | sd | n | missing | |
|---|---|---|---|---|---|---|---|---|---|---|
| ...1 | Diabet_Prev | 5.6 | 8.800 | 10.100 | 11.500 | 19.50 | 10.436 | 2.404 | 348 | 0 |
| ...2 | membershp_assocn | 0.0 | 7.858 | 10.364 | 12.915 | 30.87 | 10.424 | 3.973 | 348 | 0 |
This sample does not have any missing values.
19.3 Approach
19.3.1 Fitted Model
Now we will fit our model and use our predictor (membershp_assocn) to predict the outcome (Diabet_Prev) in our sample.
ggplot(data = Analysis_1, aes(x = membershp_assocn, y = Diabet_Prev)) +
geom_point(shape = 1, size = 2, col = "grey50") +
geom_smooth(method = "loess", se = FALSE, formula = y ~ x, col = "red") +
geom_smooth(method = "lm", formula = y ~ x,
se = FALSE, col = "steelblue") +
labs(title = "Number of membership associations vs diabetes prevalence in CHR 2023 data",
subtitle = "with fitted straight line regression model",
x = "Membership association numbers per 10,000", y = "Diabetes Prevalence %",
caption = str_glue("Pearson correlation is ",
round_half_up(cor(Analysis_1$Diabet_Prev, Analysis_1$membershp_assocn),3))) +
annotate("text", x = 25, y = 15, col = "red", size = 5,
label = paste("y = ", round_half_up(coef(lm(Analysis_1$Diabet_Prev ~ Analysis_1$membershp_assocn))[1],3),
" + " ,
round_half_up(coef(lm(Analysis_1$Diabet_Prev ~ Analysis_1$membershp_assocn))[2],2), "x"))A1_linear <- lm(Diabet_Prev ~ membershp_assocn, data = Analysis_1)
summary(A1_linear)
Call:
lm(formula = Diabet_Prev ~ membershp_assocn, data = Analysis_1)
Residuals:
Min 1Q Median 3Q Max
-4.8827 -1.6416 -0.3205 1.1082 9.0743
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.03962 0.36210 27.73 <2e-16 ***
membershp_assocn 0.03799 0.03246 1.17 0.243
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.403 on 346 degrees of freedom
Multiple R-squared: 0.003942, Adjusted R-squared: 0.001063
F-statistic: 1.369 on 1 and 346 DF, p-value: 0.2427
A1_linear <- lm(Diabet_Prev ~ membershp_assocn, data = Analysis_1)
tidy(A1_linear, conf.int = TRUE, conf.level = 0.90) |>
gt() |> fmt_number(decimals = 3) |> gt_theme_espn()| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 10.040 | 0.362 | 27.726 | 0.000 | 9.442 | 10.637 |
| membershp_assocn | 0.038 | 0.032 | 1.170 | 0.243 | −0.016 | 0.092 |
glance(A1_linear) |>
round_half_up(digits = c(4, 4, 3, 2, 3, 0, 0, 1, 1, 0, 0, 0)) |>
gt() |> gt_theme_espn()| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0039 | 0.0011 | 2.403 | 1.37 | 0.243 | 1 | -798 | 1601.7 | 1613.3 | 1998 | 346 | 348 |
The fitted model equation is Diabet_Prev = 10.04 + 0.04 membershp_assocn. We fit the model to all 348 observations, obtaining a model R squared value of 0.39% and a residual standard deviation of 2.403. It is almost the same as the initial standard deviation of the Diabet_Prev values (2.404) from our earlier numerical summaries.
Analysis_1_Residual <- lm(Diabet_Prev ~ membershp_assocn, data = Analysis_1)
par(mfrow = c(1,2)); plot(Analysis_1_Residual, which = 1:2); par(mfrow = c(1,1))The scatter plot on the left is the residuals vs fit plot for the data set’s simple linear regression model with diabetes prevalence as the outcome and the number of membership association per 10,000 people as the predictor. In this plot, the residuals (estimated error) are plotted on the y axis and fitted values (observed outcome) on the x axis. It asses the linearity of the data, presence of unequal error variances, and outliers. As we can see the data points are clustered around the middle rather than being spread around the horizontal line. This suggests that the relationship is non linear. The spread of the data points are unequal which indicates unequal error variance. At the bottom left, there is a residual that stands out compared to the pattern of the majority of the residuals, which can be considered an outlier.
The plot on the right shows a normal Q-Q plot which asses the data skew and model fit. The standardized residuals are plotted on the y axis and the theoretical quantiles are on the x axis. As we can see there is a large skew in the residuals where the points are drastically deviating from the dotted line. This indicates that the data are not normally distributed and that a linear model may not be a suitable method for fitting our sample.
Now we will identify the two counties where the model we’ve fit was the least successful in predicting the outcome (AKA the sample Outliers):
A1_aug <- augment(A1_linear, data = Analysis_1)
A1_aug |> slice(182, 183) |>
select(state, county, .fitted, .resid, .std.resid) |>
gt() |> gt_theme_espn()| state | county | .fitted | .resid | .std.resid |
|---|---|---|---|---|
| MS | Holmes County | 10.42574 | 9.074261 | 3.781979 |
| MS | Humphreys County | 10.52498 | 8.675015 | 3.617391 |
Finally, we will display our model’s prediction for our outcome (Diabetes Prevalence) for Cuyahoga County.
Cuyahoga_predict <- predict(A1_linear, newdata = subset(chr_2023, county == "Cuyahoga County" & state == "OH"))
Cuyahoga_actual <- subset(chr_2023, county == "Cuyahoga County" & state == "OH")$Diabet_Prev
paste("Predicted Value for Cuyahoga County", Cuyahoga_predict)[1] "Predicted Value for Cuyahoga County 10.3864478265588"
paste("Actual Value for Cuyahoga County", Cuyahoga_actual)[1] "Actual Value for Cuyahoga County 11.3"
The predicted value of the outcome (Diabetes Prevalence) for Cuyahoga County was 10.4, while the actual value of Diabetes Prevalence for Cuyahoga County was 11.3.
19.4 Conclusions
Our research question was whether the number of membership associations was inversely associated with diabetes prevalence among 441 counties from six states ( California, Colorado, Indiana, Mississippi, New York, and Ohio). The simple linear regression model produced from this sample provided a unexpected answer. Number of membership associations per 10,000 people had a positive relationship with diabetes prevalence among adults in the sample counties. As opposed to our initial belief, increased number of membership associations increased diabetes prevalence. However, the strength of the association was very weak (pearson correlation= 0.063). From our results, we can tell that membership associations may not be the best predictor for diabetes prevalence among adults in US counties.
There some limitations to our work in Analysis 1. One of the major limitations is that the relationship between the variables is non linear. The distribution of the residuals was also not normal. Our sample does not follow the assumption of normality nor linearity which means that using a simple linear regression model may not be appropriate. In order to prove that, additional transformation should be done for the predictor. Another step would be to investigate the outliers and see whether they can be removed to improve the fit of our model. The sample counties were extracted from six randomly selected states. Stratifying the analysis by state can be another approach to yield better predictions.
20 Analysis 2
20.1 Variables
In this analysis, our aim is to investigate whether there are significant differences in premature age-adjusted mortality, which quantifies the number of deaths among residents under age 75 per 100,000 population, based on high and low levels of secondary education completion in 441 counties across California, Colorado, Indiana, Mississippi, New York, and Ohio. The primary objective of this study is to evaluate whether higher levels of secondary education completion in counties are associated with lower premature mortality rates, potentially providing a valuable tool to address disparities in population health.
Outcome Variable:
Premature Age-Adjusted Mortality (adj_pre_mort) : This is our quantitative outcome variable, representing the number of deaths among residents under the age of 75 per 100,000 population in a county. This variable measures the premature mortality rate in each county, which is a critical indicator of public health.
Predictor Variable:
Level of Secondary Education (Highschl_complet_grp): This is our two-level predictor variable, which categorizes the counties into two groups based on the level of secondary education. The two levels are: High Secondary Education: Counties with a higher level of secondary education. Low Secondary Education: Counties with a lower level of secondary education. Our research is centered on understanding whether there are statistically significant differences in premature age-adjusted mortality rates between these two groups of counties. Such differences may indicate an association between secondary education and disparities in population health.
There are no missing values for either the premature age-adjusted mortality or secondary education levels in our sample. Our original sample was composed of 441 counties from six randomly selected states. However, after dichotomizing our predictor (High school completion) into two levels (High and low), our current analysis sample was reduced to 351 counties.
20.2 Summaries
20.2.1 sample prep
We will create a sub sample for analysis 2 to include our predictor ‘Highschl_complet_grp’ with its two levels (High -low) and our outcome ‘adj_pre_mort’.
Analysis_2 <- chr_2023 |>
filter(complete.cases(Highschl_complet_grp)) |>
select(state, county, Highschl_complet_grp, adj_pre_mort)
tibble::glimpse(Analysis_2)Rows: 351
Columns: 4
$ state <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, C…
$ county <chr> "Alpine County", "Amador County", "Butte County",…
$ Highschl_complet_grp <fct> High, High, High, High, Low, Low, High, Low, Low,…
$ adj_pre_mort <dbl> 991.5913, 326.9962, 407.5274, 357.5707, 333.1104,…
Our sample size was reduced to 351 counties with complete data on both variables.
20.2.2 Distribution of the Outcome within Predictor Groups
A comprehensive visualization was created using ggplot2 in R to analyze the relationship between the predictor ‘Highschl_complet_grp’ and the outcome ‘adj_pre_mort’. The visualization includes violin plots to show data density, notched boxplots to provide insights into central tendency and spread, and red points to indicate the mean values. This plot allows for a comparison of how different levels of secondary education may impact the distribution of premature mortality.
ggplot(Analysis_2, aes(x = adj_pre_mort, y = Highschl_complet_grp)) +
geom_violin(fill = "skyblue", color = "blue") +
geom_boxplot(width = 0.3, fill = "lightgray", notch = TRUE,
outlier.color = "red", outlier.size = 3) +
stat_summary(fun = "mean", geom = "point", shape = 23, size = 3,
fill = "red") +
labs(x = "Premature Age-Adjusted Mortality per 100,000 population", y = "Level of Secondary Education completion",
title = "Analysis of Premature Mortality and Secondary Education Completion",
subtitle = "Among 351 US counties")The ‘High’ group of secondary education completion have a lower premature age-adjusted mortality median compared to the ‘low’ group.
One of the counties with extreme premature age-adjusted mortality value despite being in the ‘High’ secondary education completion group is Alpine County in California.
Analysis_2 |> arrange(desc(adj_pre_mort)) |> head(3)# A tibble: 3 × 4
state county Highschl_complet_grp adj_pre_mort
<fct> <chr> <fct> <dbl>
1 CA Alpine County High 992.
2 MS Coahoma County Low 853.
3 MS Sunflower County Low 838.
# Visualization for Counties with High Secondary Education
p1 <- ggplot(
data = Analysis_2 |> filter(Highschl_complet_grp == "High"), aes(sample = adj_pre_mort)) +
geom_qq() + geom_qq_line(col = "blue") +
theme(aspect.ratio = 1) +
labs(title = "High Education Completion Counties",
y = "Premature Age-Adjusted Mortality", x = "Standard Normal Expectation")
# Visualization for Counties with Low Secondary Education
p2 <- ggplot(
data = Analysis_2 |> filter(Highschl_complet_grp == "Low"), aes(sample = adj_pre_mort)) +
geom_qq() + geom_qq_line(col = "blue") +
theme(aspect.ratio = 1) +
labs(title = "Low Education Completion Counties",
y = "Premature Age-Adjusted Mortality", x = "Standard Normal Expectation")
p1 + p2Both populations are approximately normally distributed.
20.3 Numerical Summary
The outline the distribution of ‘ADJ_PRE_MORT’ across “High” and “Low” educational groups. The median values reveal the center of the distributions, where “Low” educational group tends to have a higher median ADJ_PRE_MORT value compared to the “High” educational group. Additionally, there’s a notable difference in the quartiles, means, and standard deviations, suggesting that “Low” educational group has higher variability and potentially higher mortality rates compared to “High” educational group.
key2 <- favstats(adj_pre_mort ~ Highschl_complet_grp, data = Analysis_2)
key2 |> kbl(digits = 2) |> kable_styling()| Highschl_complet_grp | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| High | 133.41 | 294.87 | 363.09 | 410.19 | 991.59 | 354.72 | 97.53 | 175 | 0 |
| Low | 175.26 | 392.16 | 494.60 | 578.43 | 853.05 | 501.48 | 141.65 | 176 | 0 |
20.4 Approach
20.4.1 90% CI for difference in group means via t-based procedure
Analysis_2 |> group_by(Highschl_complet_grp) |>
summarise(n = n(), var(adj_pre_mort))# A tibble: 2 × 3
Highschl_complet_grp n `var(adj_pre_mort)`
<fct> <int> <dbl>
1 High 175 9513.
2 Low 176 20064.
The variance values indicates a substantial discrepancy between the premature age-adjusted mortality rates among the “Low” and “High” secondary education groups. Generally, if one group’s variance is at least 50% greater than the other, it could raise more concern about the disparity in variances and potentially require additional analysis. In this case, the variance difference is substantial, warranting consideration for its implication. Since the variance value is notably different, it is sensible to use other methods which account for unequal variances between the groups. The Welch’s t test is known for handling unequal variance and it is a robust statistical approach in this situation. By utilizing the Welch test the analysis takes into account the significant differences in variance. providing more reliable results without the necessity of assuming equal population variances.
t.test(adj_pre_mort ~ Highschl_complet_grp, data = Analysis_2,
conf.int = TRUE, conf.level = 0.90) |>
tidy() |>
kbl(digits = 3) |> kable_styling()| estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|
| -146.765 | 354.717 | 501.482 | -11.311 | 0 | 310.644 | -168.171 | -125.359 | Welch Two Sample t-test | two.sided |
The two sample t-test approach assumes equal variances.
t.test(adj_pre_mort ~ Highschl_complet_grp, data = Analysis_2,var.equal = TRUE,
conf.int = TRUE, conf.level = 0.90) |>
tidy() |>
kbl(digits = 3) |> kable_styling()| estimate | estimate1 | estimate2 | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|
| -146.765 | 354.717 | 501.482 | -11.3 | 0 | 349 | -168.186 | -125.344 | Two Sample t-test | two.sided |
Another way to get the “equal variances” in 90% confidence interval:
m3 <- lm(adj_pre_mort ~ Highschl_complet_grp, data = Analysis_2)
tidy(m3, conf.int = TRUE, conf.level = 0.90) |>
kbl(digits = 3) |> kable_styling()| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 354.717 | 9.197 | 38.568 | 0 | 339.549 | 369.886 |
| Highschl_complet_grpLow | 146.765 | 12.988 | 11.300 | 0 | 125.344 | 168.186 |
20.4.2 90% CI for difference in group means via bootstrap
Our 90% bootstrap confidence interval for the difference in population means (High - low) in ‘Highschl_complet_grp’ is (126.26, 168.00) with a point estimate of 146.76. The mean difference does not equal 0 and the range does not include 0. We can safely conclude that the level of secondary education completion is associated with premature age-adjusted mortality among our sample counties.
The ‘Low’ group exhibited higher variance compared to the ‘High’ group. The presence of variance between the independent samples, along with the disparities between the groups, suggests that the predictor can significantly impact the outcome. Therefore, it is crucial to further examine the significance of the difference in premature age-adjusted mortality rates across secondary education completion classifications.
20.5 Conclusions
In our investigation across 351 counties (originally 441) from from six randomly selected states (California, Colorado, Indiana, Mississippi, New York, and Ohio), we aimed to explore the relationship between two levels of secondary education completion (High - low) and premature age-adjusted mortality rates. Premature age-adjusted mortality was measured among residents under age of 75 years old per 100,000 population. The findings showed substantial discrepancy in premature age-adjusted mortality rates between counties designated as ‘High’ and ‘Low’ in secondary education completion. The 90% confidence interval (126.26, 168.00) and point estimate (146.76) demonstrated a noteworthy disparity. This indicates a considerable inequality in premature mortality rates across secondary education completion classifications.
The results consistently affirm marked variations in premature age-adjusted mortality rates between ‘High’ and ‘Low’ counties. Yet, there are limitations to the analysis. Scrutiny is essential regarding assumptions behind interval selections, particularly concerning data normality and assumptions of equal variances. Furthermore, the representative of the selected states remains uncertain. For a more robust analysis, considering a more diverse and representative set of states or regions across the US, along with investigating additional factors such as healthcare access and socioeconomic conditions, could significantly enhance insights into these mortality rate disparities.
21 Analysis 3
21.1 Variables
In analysis 3, our outcome is the quantity of sleep (sleep insufficiency). The measurement is the percentage of adults who report fewer than 7 hours of sleep on average (age-adjusted). The sample includes 441 counties from 6 states (California, Colorado, Indiana, Mississippi, New York, and Ohio). The outcome will be measured in two different time points before and after the COVID-19 pandemic (2018 and 2023) to evaluate the presence of any remarkable changes in sleep insufficiency among our study sample Pre and Post pandemic period.
The sleep insufficiency outcome is divided into two variables. The first variable is the 2018 sleep insufficiency ‘(insuff_sleep_2018)’ which was collected from Behavioral Risk Factor Surveillance System in 2016. The second variable is the 2023 sleep insufficiency ‘(insuff_sleep_2023)’ which was collected from the Behavioral Risk Factor Surveillance System in 2020.
21.2 Summaries
21.2.1 Preparing the paired sample
First we will filter Analysis 3 data set.
Analysis_3 <- chr_2023 |>
filter(complete.cases(chr_2023)) |>
select(insuff_sleep_2023, insuff_sleep_2018)Second, we will add the a column calculating the paired difference between insufficient sleep in 2023 and insufficient sleep in 2018.
Org_Analysis_3 <- Analysis_3
Analysis_3 <- Org_Analysis_3 |>
mutate(insuff_sleep_dif = insuff_sleep_2023 - insuff_sleep_2018)
Analysis_3# A tibble: 348 × 3
insuff_sleep_2023 insuff_sleep_2018 insuff_sleep_dif
<dbl> <dbl> <dbl>
1 33.8 30.2 3.63
2 33.7 31.7 2.01
3 32.4 30.8 1.59
4 33.8 33.3 0.472
5 35 32.5 2.47
6 31.6 30.6 1.02
7 32.7 34.8 -2.06
8 33.7 34.0 -0.331
9 34.8 32.2 2.62
10 34.7 33.8 0.928
# ℹ 338 more rows
21.2.2 Distribution of the paired differences
Now we will visualize our sample’s distribution:
## Normal Q-Q plot
p1 <- ggplot(Analysis_3, aes(sample = insuff_sleep_dif)) +
geom_qq(col = "dodgerblue", size = 2) + geom_qq_line(col = "steelblue") +
theme(aspect.ratio = 1) +
labs(title = "Normal Q-Q plot",
y = "2023 - 2018 insufficient sleep",
x = "Expectation under Standard Normal")
## Histogram with Normal density superimposed
p2 <- ggplot(Analysis_3, aes(insuff_sleep_dif)) +
geom_histogram(aes(y = after_stat(density)),
bins = 7, fill = "steelblue", col = "lightgray") +
stat_function(fun = dnorm,
args = list(mean = mean(Analysis_3$insuff_sleep_dif, na.rm = TRUE),
sd = sd(Analysis_3$insuff_sleep_dif, na.rm = TRUE)),
col = "darkgray", lwd = 1.5) +
labs(title = "Histogram with Normal Density",
x = "2023 - 2018 insufficient sleep")
## Boxplot with notch and rug
p3 <- ggplot(Analysis_3, aes(x = insuff_sleep_dif, y = "")) +
geom_boxplot(fill = "steelblue", notch = TRUE,
outlier.color = "steelblue", outlier.size = 2) +
stat_summary(fun = "mean", geom = "point",
shape = 23, size = 3, fill = "lightgray") +
geom_rug(sides = "b") +
labs(title = "Boxplot with Notch and Rug",
x = "2023 - 2018 insufficient sleep",
y = "")
p1 + (p2 / p3 + plot_layout(heights = c(4,1))) +
plot_annotation(title = "Difference in Sleep Insufficiency % Between 2023 and 2018",
subtitle = str_glue("Among adults in six US counties (n = ",
nrow(Analysis_3), ")"),
caption = str_glue("Paired Differences: Sample Size = ", nrow(Analysis_3),
", Sample Median = ", round_half_up(median(Analysis_3$insuff_sleep_dif),2),
", Mean = ", round_half_up(mean(Analysis_3$insuff_sleep_dif),2),
" and SD = ", round_half_up(sd(Analysis_3$insuff_sleep_dif),2)))21.2.3 How well did the pairing work for this sample?
ggplot(Analysis_3, aes(x = insuff_sleep_2018, y = insuff_sleep_2023)) +
geom_point(size = 2) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = "Paired Samples for Insufficient Sleep Study",
x = "Insufficient sleep percentage in 2018",
y = "Insufficient sleep percentage in 2023",
subtitle = "348 pairs of counties with sleep insufficiency percentage measured in 2018 and 2023",
caption = str_glue("Pearson correlation = ", round_half_up(cor(Analysis_3$insuff_sleep_2018, Analysis_3$insuff_sleep_2023),2)))As we can see, the plot displays a positive correlation between the 2018 and the 2023 samples with a strong Pearson correlation (0.86). This indicated that pairing worked well in reducing variation among our study sample.
21.2.4 Numerical Summaries
key3 <- bind_rows(
favstats(~ insuff_sleep_dif, data = Analysis_3),
favstats(~ insuff_sleep_2023, data = Analysis_3),
favstats(~ insuff_sleep_2018, data = Analysis_3)) |>
mutate(group = c("Insufficient sleep difference", "Insufficient sleep 2023", "Insufficient sleep 2018")) |>
relocate(group)
key3 |> kbl(digits = 3) |> kable_styling()| group | min | Q1 | median | Q3 | max | mean | sd | n | missing | |
|---|---|---|---|---|---|---|---|---|---|---|
| ...1 | Insufficient sleep difference | -4.647 | -0.687 | 0.598 | 1.867 | 9.762 | 0.583 | 2.030 | 348 | 0 |
| ...2 | Insufficient sleep 2023 | 23.800 | 32.600 | 34.750 | 36.600 | 43.300 | 34.400 | 3.364 | 348 | 0 |
| ...3 | Insufficient sleep 2018 | 23.028 | 31.913 | 34.284 | 36.316 | 42.783 | 33.816 | 3.945 | 348 | 0 |
21.3 Approach
To determine which test is most appropriate for building the 90% CI for the population mean difference, we will need to observe the distribution of each sample.
First we will retrieve the original data and slightly change it to a ‘Longer Format’ to serve our purposes:
Analysis_3_longer <- Org_Analysis_3 |>
pivot_longer(
cols = -c(),
names_to = "status",
values_to = "insufficient_sleep_percentage")
Analysis_3_longer# A tibble: 696 × 2
status insufficient_sleep_percentage
<chr> <dbl>
1 insuff_sleep_2023 33.8
2 insuff_sleep_2018 30.2
3 insuff_sleep_2023 33.7
4 insuff_sleep_2018 31.7
5 insuff_sleep_2023 32.4
6 insuff_sleep_2018 30.8
7 insuff_sleep_2023 33.8
8 insuff_sleep_2018 33.3
9 insuff_sleep_2023 35
10 insuff_sleep_2018 32.5
# ℹ 686 more rows
ggplot(Analysis_3_longer, aes(x = status, y = insufficient_sleep_percentage)) +
geom_violin() +
geom_boxplot(aes(fill = status), width = 0.2) +
stat_summary(fun = "mean", geom = "point",
shape = 23, size = 3, fill = "blue") +
scale_fill_viridis_d(alpha = 0.5) +
guides(fill = "none") +
coord_flip() +
labs(title = "Boxplot of Insufficient Sleep % among adults in 2023 and 2018",
subtitle = "Sample includes 348 counties from 6 States") Another useful visualization to asses the sample’s distribution:
ggplot(Analysis_3_longer, aes(x = insufficient_sleep_percentage, y = status, fill = status)) +
geom_density_ridges(scale = 0.9) +
guides(fill = "none") +
labs(title = "Insufficient Sleep % among adults in 2023 and 2018",
subtitle = "Sample includes 348 counties from 6 States") +
theme_ridges()Picking joint bandwidth of 0.875
A statistical summary of our two paired samples:
mosaic::favstats(insufficient_sleep_percentage ~ status, data = Analysis_3_longer) |>
kbl(digits = 2) |> kable_styling()| status | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| insuff_sleep_2018 | 23.03 | 31.91 | 34.28 | 36.32 | 42.78 | 33.82 | 3.95 | 348 | 0 |
| insuff_sleep_2023 | 23.80 | 32.60 | 34.75 | 36.60 | 43.30 | 34.40 | 3.36 | 348 | 0 |
We can conclude that the distribution of the paired difference and the separate samples are approximately normal with mild skew. The paired samples also have close variation.
21.3.1 The 90% CI for mean paired differences via t-based procedure:
t1 <- t.test(Analysis_3$insuff_sleep_dif, conf.level = 0.9)
tidy(t1)|> kbl(digits = 3) |> kable_styling()| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.583 | 5.362 | 0 | 347 | 0.404 | 0.763 | One Sample t-test | two.sided |
21.3.2 The 90% CI for mean paired differences via Boots strap based procedure :
t2 <-
set.seed(431)
smean.cl.boot(Analysis_3$insuff_sleep_dif, B = 2000, conf.int = 0.9)|> kbl(digits = 3) |> kable_styling()| x | |
|---|---|
| Mean | 0.583 |
| Lower | 0.414 |
| Upper | 0.772 |
Our sample satisfies the assumptions of t test which require normal distribution of the population or the sample mean at least and that the paired samples were selected at random, and that the sampled paired differences are drawn from the population set of paired differences independently and have identical distributions.
In this case we will use the t-based 90% CI to calculate the mean differences for our independent two samples.
21.4 Conclusions
Our research question was whether there was meaningful difference in sleep insufficiency among adults between the years 2018 and 2023 (pre and COVID-19 pandemic) from 441 US counties. Our initial belief is that there will be meaningful difference and that sleep insufficiency proportion would increase in 2023. Our assumption was based on the fact that many health behaviors showed considerable decline in the post pandemic period. Our analysis result aligned closely with our expectations. The independent samples t-test 90% Confidence interval was 0.404, 0.763 with a point estimate of 0.583. The mean difference is positive and the confidence interval does not include 0. We can safely conclude that there is a meaningful increase in sleep insufficiency proportion among adults in our study sample.
However, limitations exist in the county selection as well as potential issues with paired data. Addressing these concerns involves refining confidence intervals and selecting a more diverse county sample to improve the analysis’ robustness. The study’s limitations stem from the selected counties potentially not accurately representing the broader US population. Issues with pairing data from 2018 and 2023 pose concerns regarding the validity of the comparison. To enhance this analysis, refining the confidence intervals and broadening the county selection to capture a more comprehensive demographic spectrum could strengthen the study’s generalizability. Additionally, exploring contributing factors and collecting qualitative data may offer a richer understanding of sleep behavior changes, further improving the analysis.
22 Portfolio Reflection
During the development of our portfolio for project A, we faced some challenges in the analytical phases. One of the major obstacles we encountered was understanding and interpreting residual analysis in Analysis 1. It was a relatively new concept for us and we had little experience in handling that kind of work. Interpreting the fitted versus residual plot required time and effort to comprehend the concept and its implications for our analysis. Another obstacle was coding for analysis 2, where we inadvertently excluded the ‘Highschl_complet_grp’ variable in the code. To overcome this, we had to review our code and incorporate the necessary categorical variables (‘High’ and ‘Low’) to rectify the variable loss. Despite these challenges, we gained valuable insights into refining our analytical techniques and ensuring a more robust approach in future projects.
23 Session Information
session_info()Warning in system2("quarto", "-V", stdout = TRUE, env = paste0("TMPDIR=", :
running command '"quarto"
TMPDIR=C:/Users/albal/AppData/Local/Temp/RtmpGmIh0R/file2b785b703673 -V' had
status 1
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.5.1 (2025-06-13 ucrt)
os Windows 11 x64 (build 26100)
system x86_64, mingw32
ui RTerm
language (EN)
collate English_United States.utf8
ctype English_United States.utf8
tz America/New_York
date 2025-10-28
pandoc 3.4 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
quarto NA @ C:\\PROGRA~1\\RStudio\\RESOUR~1\\app\\bin\\quarto\\bin\\quarto.exe
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
abind 1.4-8 2024-09-12 [1] CRAN (R 4.5.0)
backports 1.5.0 2024-05-23 [1] CRAN (R 4.5.0)
base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.5.0)
bit 4.6.0 2025-03-06 [1] CRAN (R 4.5.1)
bit64 4.6.0-1 2025-01-16 [1] CRAN (R 4.5.1)
broom * 1.0.10 2025-09-13 [1] CRAN (R 4.5.1)
car * 3.1-3 2024-09-27 [1] CRAN (R 4.5.1)
carData * 3.0-5 2022-01-06 [1] CRAN (R 4.5.1)
checkmate 2.3.3 2025-08-18 [1] CRAN (R 4.5.1)
cli 3.6.5 2025-04-23 [1] CRAN (R 4.5.1)
cluster 2.1.8.1 2025-03-12 [2] CRAN (R 4.5.1)
colorspace 2.1-2 2025-09-22 [1] CRAN (R 4.5.1)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.5.1)
curl 7.0.0 2025-08-19 [1] CRAN (R 4.5.1)
data.table 1.17.8 2025-07-10 [1] CRAN (R 4.5.1)
digest 0.6.37 2024-08-19 [1] CRAN (R 4.5.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.5.1)
evaluate 1.0.5 2025-08-27 [1] CRAN (R 4.5.1)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.5.1)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.5.1)
fontawesome 0.5.3 2024-11-16 [1] CRAN (R 4.5.1)
fontBitstreamVera 0.1.1 2017-02-01 [1] CRAN (R 4.5.0)
fontLiberation 0.1.0 2016-10-15 [1] CRAN (R 4.5.0)
fontquiver 0.2.1 2017-02-01 [1] CRAN (R 4.5.1)
forcats * 1.0.1 2025-09-25 [1] CRAN (R 4.5.1)
foreign 0.8-90 2025-03-31 [2] CRAN (R 4.5.1)
Formula 1.2-5 2023-02-24 [1] CRAN (R 4.5.0)
fs 1.6.6 2025-04-12 [1] CRAN (R 4.5.1)
gdtools 0.4.4 2025-10-06 [1] CRAN (R 4.5.1)
generics 0.1.4 2025-05-09 [1] CRAN (R 4.5.1)
ggformula * 1.0.0 2025-10-06 [1] CRAN (R 4.5.1)
ggiraph 0.9.2 2025-10-07 [1] CRAN (R 4.5.1)
ggplot2 * 4.0.0 2025-09-11 [1] CRAN (R 4.5.1)
ggridges * 0.5.7 2025-08-27 [1] CRAN (R 4.5.1)
glue 1.8.0 2024-09-30 [1] CRAN (R 4.5.1)
gridExtra 2.3 2017-09-09 [1] CRAN (R 4.5.1)
gt * 1.1.0 2025-09-23 [1] CRAN (R 4.5.1)
gtable 0.3.6 2024-10-25 [1] CRAN (R 4.5.1)
gtExtras * 0.6.1 2025-10-10 [1] CRAN (R 4.5.1)
haven 2.5.5 2025-05-30 [1] CRAN (R 4.5.1)
Hmisc * 5.2-4 2025-10-05 [1] CRAN (R 4.5.1)
hms 1.1.4 2025-10-17 [1] CRAN (R 4.5.1)
htmlTable 2.4.3 2024-07-21 [1] CRAN (R 4.5.1)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.5.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.5.1)
janitor * 2.2.1 2024-12-22 [1] CRAN (R 4.5.1)
jsonlite 2.0.0 2025-03-27 [1] CRAN (R 4.5.1)
kableExtra * 1.4.0 2024-01-24 [1] CRAN (R 4.5.1)
knitr 1.50 2025-03-16 [1] CRAN (R 4.5.1)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.5.0)
labelled 2.15.0 2025-09-16 [1] CRAN (R 4.5.1)
lattice * 0.22-7 2025-04-02 [2] CRAN (R 4.5.1)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.5.1)
lubridate * 1.9.4 2024-12-08 [1] CRAN (R 4.5.1)
magrittr 2.0.4 2025-09-12 [1] CRAN (R 4.5.1)
MASS 7.3-65 2025-02-28 [2] CRAN (R 4.5.1)
Matrix * 1.7-3 2025-03-11 [2] CRAN (R 4.5.1)
mgcv 1.9-3 2025-04-04 [2] CRAN (R 4.5.1)
mosaic * 1.9.2 2025-07-30 [1] CRAN (R 4.5.1)
mosaicCore 0.9.5 2025-07-30 [1] CRAN (R 4.5.1)
mosaicData * 0.20.4 2023-11-05 [1] CRAN (R 4.5.1)
naniar * 1.1.0 2024-03-05 [1] CRAN (R 4.5.1)
nlme 3.1-168 2025-03-31 [2] CRAN (R 4.5.1)
nnet 7.3-20 2025-01-01 [2] CRAN (R 4.5.1)
paletteer 1.6.0 2024-01-21 [1] CRAN (R 4.5.1)
patchwork * 1.3.2 2025-08-25 [1] CRAN (R 4.5.1)
pillar 1.11.1 2025-09-17 [1] CRAN (R 4.5.1)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.5.1)
purrr * 1.1.0 2025-07-10 [1] CRAN (R 4.5.1)
R6 2.6.1 2025-02-15 [1] CRAN (R 4.5.1)
RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.5.0)
Rcpp 1.1.0 2025-07-02 [1] CRAN (R 4.5.1)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.5.1)
rematch2 2.1.2 2020-05-01 [1] CRAN (R 4.5.1)
rlang 1.1.6 2025-04-11 [1] CRAN (R 4.5.1)
rmarkdown 2.30 2025-09-28 [1] CRAN (R 4.5.1)
rpart 4.1.24 2025-01-07 [2] CRAN (R 4.5.1)
rstudioapi 0.17.1 2024-10-22 [1] CRAN (R 4.5.1)
S7 0.2.0 2024-11-07 [1] CRAN (R 4.5.1)
sass 0.4.10 2025-04-11 [1] CRAN (R 4.5.1)
scales 1.4.0 2025-04-24 [1] CRAN (R 4.5.1)
sessioninfo * 1.2.3 2025-02-05 [1] CRAN (R 4.5.1)
snakecase 0.11.1 2023-08-27 [1] CRAN (R 4.5.1)
stringi 1.8.7 2025-03-27 [1] CRAN (R 4.5.0)
stringr * 1.5.2 2025-09-08 [1] CRAN (R 4.5.1)
svglite 2.2.1 2025-05-12 [1] CRAN (R 4.5.1)
systemfonts 1.3.1 2025-10-01 [1] CRAN (R 4.5.1)
textshaping 1.0.4 2025-10-10 [1] CRAN (R 4.5.1)
tibble * 3.3.0 2025-06-08 [1] CRAN (R 4.5.1)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.5.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.5.1)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.5.1)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.5.1)
tzdb 0.5.0 2025-03-15 [1] CRAN (R 4.5.1)
utf8 1.2.6 2025-06-08 [1] CRAN (R 4.5.1)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.5.1)
viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.5.1)
visdat 0.6.0 2023-02-02 [1] CRAN (R 4.5.1)
vroom 1.6.6 2025-09-19 [1] CRAN (R 4.5.1)
withr 3.0.2 2024-10-28 [1] CRAN (R 4.5.1)
xfun 0.52 2025-04-02 [1] CRAN (R 4.5.1)
xml2 1.4.0 2025-08-20 [1] CRAN (R 4.5.1)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.5.0)
[1] C:/Users/albal/AppData/Local/R/win-library/4.5
[2] C:/Program Files/R/R-4.5.1/library
* ── Packages attached to the search path.
──────────────────────────────────────────────────────────────────────────────