RQ2

Author

Diane

Published

2025-12-05

Research Question 1: What predictors can help determine whether an ER visit results in inpatient admission?

Library Download

library(tidymodels)
Warning: package 'tidymodels' was built under R version 4.5.2
── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
✔ broom        1.0.10     ✔ recipes      1.3.1 
✔ dials        1.4.2      ✔ rsample      1.3.1 
✔ dplyr        1.1.4      ✔ tailor       0.1.0 
✔ ggplot2      4.0.1      ✔ tidyr        1.3.1 
✔ infer        1.0.9      ✔ tune         2.0.1 
✔ modeldata    1.5.1      ✔ workflows    1.3.0 
✔ parsnip      1.3.3      ✔ workflowsets 1.1.1 
✔ purrr        1.1.0      ✔ yardstick    1.3.2 
Warning: package 'dials' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'infer' was built under R version 4.5.2
Warning: package 'modeldata' was built under R version 4.5.2
Warning: package 'parsnip' was built under R version 4.5.2
Warning: package 'recipes' was built under R version 4.5.2
Warning: package 'rsample' was built under R version 4.5.2
Warning: package 'tailor' was built under R version 4.5.2
Warning: package 'tune' was built under R version 4.5.2
Warning: package 'workflows' was built under R version 4.5.2
Warning: package 'workflowsets' was built under R version 4.5.2
Warning: package 'yardstick' was built under R version 4.5.2
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
library(glmnet)
Warning: package 'glmnet' was built under R version 4.5.2
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loaded glmnet 4.1-10
library(xgboost)
Warning: package 'xgboost' was built under R version 4.5.2

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice
library(vip)

Attaching package: 'vip'
The following object is masked from 'package:utils':

    vi
library(haven)
Warning: package 'haven' was built under R version 4.5.2
library(ranger)
Warning: package 'ranger' was built under R version 4.5.2
library(randomForest)
Warning: package 'randomForest' was built under R version 4.5.2
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'
The following object is masked from 'package:ranger':

    importance
The following object is masked from 'package:ggplot2':

    margin
The following object is masked from 'package:dplyr':

    combine
library(tree)
Warning: package 'tree' was built under R version 4.5.2
library(ISLR)
Warning: package 'ISLR' was built under R version 4.5.2
library(dplyr)
library (plotly)

Attaching package: 'plotly'
The following object is masked from 'package:xgboost':

    slice
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library (car)
Warning: package 'car' was built under R version 4.5.2
Loading required package: carData
Warning: package 'carData' was built under R version 4.5.2

Attaching package: 'car'
The following object is masked from 'package:purrr':

    some
The following object is masked from 'package:dplyr':

    recode

Research Questions: 2. Which type of ER visits are the costliest for total ER expenditures? Types of tests / diagnostics? *note: Total ER expenditure is different than out of pocket

OGData= read.csv("C:/Users/diane/downloads/h248e.csv")

Looking for missing values: the codes -1, -7, and -8 indicate missing / "don't know" responses. Observations containing these negative values were removed entirely, which dropped 317 observations.

Renaming Variables

# Subset the raw MEPS file to the variables used across all three
# research questions, renaming the survey codes to readable names.
OGData_clean <- OGData %>%
  dplyr::select(
    # Identifiers and survey weight
    person_id   = DUPERSID,
    weight      = PERWT23F,

    # RQ1 outcome: inpatient admission indicator (recoded below)
    admitted    = ERHEVIDX,

    # RQ1 predictors: tests / diagnostics performed during the ER visit
    mri_ct      = MRI_M18,
    surgery     = SURGPROC,
    xray        = XRAYS_M18,
    lab_tests   = LABTEST_M18,
    ekg         = EKG_M18,
    ultrasound  = SONOGRAM_M18,
    mammogram   = MAMMOG_M18,
    vaccination = RCVVAC_M18,
    rx_given    = MEDPRESC,
    related_condition = VSTRELCN,

    # RQ2 outcome: total ER expenditure (facility + physician)
    total_cost  = ERXP23X,

    # RQ3: out-of-pocket and payer-specific amounts
    oop_facility = ERFSF23X,
    oop_doctor   = ERDSF23X,
    private_fac  = ERFPV23X,
    private_doc  = ERDPV23X,
    medicaid_fac = ERFMD23X,
    medicaid_doc = ERDMD23X,
    medicare_fac = ERFMR23X,
    medicare_doc = ERDMR23X
  )

Adding in Insurance Information

# Cleaning data to flag insured vs uninsured visits: Insurance = 1 when
# any third-party payer (private, Medicaid, Medicare) paid a positive
# amount at either the facility or physician level, else 0.
# case_when() now yields numeric 1/0 directly, removing the original
# string round-trip ("1"/"0" followed by as.numeric()).
# NOTE: rows where every payment is NA fall through to the default and
# are coded 0, matching the original behavior.
OGData_clean <- OGData_clean %>%
  mutate(Insurance = case_when(
    private_fac > 0 | private_doc > 0 |
      medicaid_fac > 0 | medicaid_doc > 0 |
      medicare_fac > 0 | medicare_doc > 0 ~ 1,
    TRUE ~ 0
  ))

Cleaning up the binary variables: the survey items are coded 1 = yes and 2 = no (with special codes such as 95 and -8 for "don't know" / other responses). They are recoded to 0/1, with no = 0 and yes = 1.

# Yes/no survey items: keep value 1 ("yes") as 1 and collapse every
# other observed code (2 = no, plus special codes) to 0. NA values
# propagate as NA, exactly as with the original ifelse() recode.
yesno_vars <- c(
  "mri_ct", "surgery", "xray", "lab_tests", "ekg",
  "ultrasound", "mammogram", "vaccination", "rx_given",
  "related_condition"
)

OGData_clean <- OGData_clean %>%
  mutate(across(all_of(yesno_vars), ~ as.numeric(.x == 1)))

# RQ1 outcome: ERHEVIDX is "-1" when the visit did not lead to an
# inpatient stay; any other value links to an admission event, so it
# is recoded as admitted = 1.
OGData_clean$admitted <- as.numeric(OGData_clean$admitted != "-1")
table(OGData_clean$admitted) # 0 (not admitted): 3345; 1 (admitted): 896

   0    1 
3345  896 

Data Visualization for RQ1 - Skewed Data

# Recode admitted and mri_ct to labelled factors so legends read
# "Yes"/"No" instead of 0/1.
RQ1Viz <- OGData_clean %>%
  mutate(
    admitted = factor(admitted, levels = c(0, 1), labels = c("No", "Yes")),
    mri_ct   = factor(mri_ct,   levels = c(0, 1), labels = c("No", "Yes"))
  )

# Outcome distribution: counts are heavily skewed toward "No".
ggplotly(
  ggplot(RQ1Viz, aes(x = admitted, fill = admitted)) +
    geom_bar(position = "dodge") +
    labs(
      title = "Count of ER Visits by In Patient Admission Status",
      x = "In Patient Admitted",
      y = "Count",
      fill = "Admitted"
    ) +
    theme_minimal()
)

# Admission split by whether an MRI/CT was performed.
# FIX: the original added both a dodged AND a stacked geom_bar layer,
# drawing each bar twice on top of itself; a single dodged layer is kept.
ggplotly(
  ggplot(RQ1Viz, aes(x = admitted, fill = mri_ct)) +
    geom_bar(position = "dodge") +
    labs(
      title = "Hospital Admission by Whether MRI CT Was Performed",
      x = "In Patient Admitted",
      y = "Count",
      fill = "MRI CT"
    ) +
    theme_minimal()
)

# Admission split by whether surgery was performed (0/1 coding here,
# since this chart uses the unlabelled OGData_clean columns).
# Same duplicate-layer fix as above.
ggplotly(
  ggplot(OGData_clean, aes(x = factor(admitted), fill = factor(surgery))) +
    geom_bar(position = "dodge") +
    labs(
      title = "Hospital Admission by Whether Surgery Was Performed",
      x = "Admitted (0 = No, 1 = Yes)",
      y = "Count",
      fill = "Surgery (0 = No, 1 = Yes)"
    ) +
    theme_minimal()
)

RQ 1 1. What predictors can help determine whether an ER visit results in an inpatient admission? Explore a number of predictors such as type of tests, diagnostics, and reason for visit

The confusion matrix, and sensitivity/recall for the "admitted" class (how many admitted patients your model correctly identifies).

ADD VARIABLE SELECTION METHOD BEFORE THIS****

# Assemble the RQ1 modelling frame: the outcome (admitted) plus the
# candidate test/diagnostic predictors and the insurance indicator.
RQ1Data <- OGData_clean %>%
  dplyr::select(
    admitted,
    mri_ct, surgery, xray, lab_tests, ekg,
    ultrasound, mammogram, vaccination, rx_given,
    related_condition, Insurance
  )

Train & Test

library(caret)
Warning: package 'caret' was built under R version 4.5.2
Loading required package: lattice

Attaching package: 'caret'
The following objects are masked from 'package:yardstick':

    precision, recall, sensitivity, specificity
The following object is masked from 'package:rsample':

    calibration
The following object is masked from 'package:purrr':

    lift
library(car)


set.seed(1)
# createDataPartition stratifies on the outcome so the admitted /
# not-admitted mix is accurately represented in both splits.
train_index <- createDataPartition(RQ1Data$admitted, p = 0.8, list = FALSE)
train_data <- RQ1Data[train_index, ]
test_data <- RQ1Data[-train_index, ]

Logistic Regression

# Logistic fit on the full RQ1 data, used only to screen predictors
# for multicollinearity via variance inflation factors.
glm_model <- glm(admitted ~ ., data = RQ1Data, family = "binomial")
vif(glm_model) # all VIFs near 1 -> no multicollinearity concern
           mri_ct           surgery              xray         lab_tests 
         1.060226          1.036254          1.058423          1.150643 
              ekg        ultrasound         mammogram       vaccination 
         1.161988          1.026398          1.006616          1.013955 
         rx_given related_condition         Insurance 
         1.032644          1.012979          1.059200 
# Predictive logistic regression fit on the training split only;
# "admitted ~ ." regresses the outcome on every remaining column.
glm.admitted <- glm(admitted ~ ., data = train_data, family = "binomial")
summary(glm.admitted)

Call:
glm(formula = admitted ~ ., family = "binomial", data = train_data)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.09992    0.19147  -5.744 9.22e-09 ***
mri_ct              0.42790    0.10906   3.924 8.72e-05 ***
surgery             0.89785    0.18537   4.844 1.28e-06 ***
xray               -0.03592    0.10341  -0.347    0.728    
lab_tests           0.82450    0.10913   7.555 4.19e-14 ***
ekg                 0.62813    0.11301   5.558 2.72e-08 ***
ultrasound          0.54345    0.13630   3.987 6.69e-05 ***
mammogram         -13.87172  238.18570  -0.058    0.954    
vaccination        -0.18130    0.48603  -0.373    0.709    
rx_given           -1.30270    0.13127  -9.924  < 2e-16 ***
related_condition   0.96239    0.17494   5.501 3.77e-08 ***
Insurance          -2.13319    0.11567 -18.442  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3517.9  on 3392  degrees of freedom
Residual deviance: 2758.8  on 3381  degrees of freedom
AIC: 2782.8

Number of Fisher Scoring iterations: 13
# Predicted admission probabilities for the held-out test set
predictionsLR <- predict(glm.admitted, newdata = test_data, type = "response")

# Convert probabilities to class labels at the usual 0.5 cutoff
LR_pred <- factor(ifelse(predictionsLR > 0.5, 1, 0), levels = c(0, 1))
LR_truth <- factor(test_data$admitted, levels = c(0, 1))

# Confusion matrix; accuracy ~81.9%.
# FIX: positive = "1" so Sensitivity/Recall are reported for the
# ADMITTED class (the stated metric of interest), rather than caret's
# default of treating the first factor level ("0") as the positive class.
LR_cm <- confusionMatrix(LR_pred, LR_truth, positive = "1")
LR_cm
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 647 124
         1  29  48
                                         
               Accuracy : 0.8196         
                 95% CI : (0.792, 0.8449)
    No Information Rate : 0.7972         
    P-Value [Acc > NIR] : 0.05553        
                                         
                  Kappa : 0.2974         
                                         
 Mcnemar's Test P-Value : 2.974e-14      
                                         
            Sensitivity : 0.9571         
            Specificity : 0.2791         
         Pos Pred Value : 0.8392         
         Neg Pred Value : 0.6234         
             Prevalence : 0.7972         
         Detection Rate : 0.7630         
   Detection Prevalence : 0.9092         
      Balanced Accuracy : 0.6181         
                                         
       'Positive' Class : 0              
                                         
# Brier score: mean squared error of the predicted probabilities
# against the 0/1 outcome (lower is better) -- not a classification MSE.
mean((predictionsLR - test_data$admitted)^2) # 0.129
[1] 0.1289928

Random Forest RQ1 - Classification Handles skewed data better

library(randomForest)
library(caret)

# Classification random forest for admitted vs not admitted
# (500 trees, default mtry); chosen because, per the note above,
# trees tend to cope better with the skewed outcome.
rf_model <- randomForest(as.factor(admitted) ~ ., data = train_data,
                         ntree = 500)
varImpPlot(rf_model)

rf_model

Call:
 randomForest(formula = as.factor(admitted) ~ ., data = train_data,      ntree = 500) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 18.16%
Confusion matrix:
     0   1 class.error
0 2585  84  0.03147246
1  532 192  0.73480663
(plot(rf_model))

             OOB          0         1
  [1,] 0.1835906 0.05648536 0.6254545
  [2,] 0.1907371 0.05690537 0.6621622
  [3,] 0.1786423 0.05005056 0.6487985
  [4,] 0.1818823 0.04993252 0.6596091
  [5,] 0.1817882 0.05296343 0.6518405
  [6,] 0.1787180 0.05060241 0.6499261
  [7,] 0.1840321 0.05492350 0.6575540
  [8,] 0.1801148 0.05184332 0.6539007
  [9,] 0.1821996 0.05074399 0.6634078
 [10,] 0.1817640 0.05273141 0.6541667
 [11,] 0.1827893 0.05134013 0.6657420
 [12,] 0.1818720 0.04975499 0.6666667
 [13,] 0.1794796 0.04701015 0.6666667
 [14,] 0.1806909 0.04656403 0.6740331
 [15,] 0.1802892 0.04615385 0.6740331
 [16,] 0.1798349 0.04572714 0.6740331
 [17,] 0.1794872 0.04383664 0.6795580
 [18,] 0.1788977 0.04308730 0.6795580
 [19,] 0.1791925 0.04271263 0.6823204
 [20,] 0.1786030 0.04383664 0.6754144
 [21,] 0.1803714 0.04496066 0.6795580
 [22,] 0.1783083 0.04308730 0.6767956
 [23,] 0.1788977 0.04046459 0.6892265
 [24,] 0.1806661 0.04121394 0.6947514
 [25,] 0.1786030 0.03821656 0.6961326
 [26,] 0.1806661 0.03671787 0.7113260
 [27,] 0.1791925 0.03709254 0.7030387
 [28,] 0.1771294 0.03484451 0.7016575
 [29,] 0.1777188 0.03521918 0.7030387
 [30,] 0.1794872 0.03821656 0.7002762
 [31,] 0.1780136 0.03671787 0.6988950
 [32,] 0.1800766 0.03784189 0.7044199
 [33,] 0.1783083 0.03521918 0.7058011
 [34,] 0.1797819 0.03596853 0.7099448
 [35,] 0.1786030 0.03484451 0.7085635
 [36,] 0.1788977 0.03409517 0.7127072
 [37,] 0.1791925 0.03446984 0.7127072
 [38,] 0.1788977 0.03372049 0.7140884
 [39,] 0.1786030 0.03297115 0.7154696
 [40,] 0.1791925 0.03409517 0.7140884
 [41,] 0.1791925 0.03372049 0.7154696
 [42,] 0.1800766 0.03372049 0.7196133
 [43,] 0.1803714 0.03259648 0.7251381
 [44,] 0.1809608 0.03222181 0.7292818
 [45,] 0.1818450 0.03297115 0.7306630
 [46,] 0.1815503 0.03222181 0.7320442
 [47,] 0.1800766 0.03184713 0.7265193
 [48,] 0.1815503 0.03222181 0.7320442
 [49,] 0.1812555 0.03109779 0.7348066
 [50,] 0.1818450 0.03259648 0.7320442
 [51,] 0.1806661 0.03109779 0.7320442
 [52,] 0.1806661 0.02997377 0.7361878
 [53,] 0.1818450 0.03222181 0.7334254
 [54,] 0.1806661 0.03072312 0.7334254
 [55,] 0.1812555 0.03109779 0.7348066
 [56,] 0.1812555 0.03184713 0.7320442
 [57,] 0.1812555 0.03222181 0.7306630
 [58,] 0.1818450 0.03259648 0.7320442
 [59,] 0.1830239 0.03297115 0.7361878
 [60,] 0.1821397 0.03297115 0.7320442
 [61,] 0.1812555 0.03147246 0.7334254
 [62,] 0.1809608 0.03072312 0.7348066
 [63,] 0.1803714 0.03034845 0.7334254
 [64,] 0.1794872 0.02959910 0.7320442
 [65,] 0.1809608 0.03034845 0.7361878
 [66,] 0.1806661 0.03109779 0.7320442
 [67,] 0.1803714 0.02997377 0.7348066
 [68,] 0.1806661 0.03034845 0.7348066
 [69,] 0.1806661 0.03034845 0.7348066
 [70,] 0.1812555 0.03109779 0.7348066
 [71,] 0.1797819 0.02959910 0.7334254
 [72,] 0.1803714 0.02997377 0.7348066
 [73,] 0.1806661 0.03072312 0.7334254
 [74,] 0.1803714 0.03034845 0.7334254
 [75,] 0.1818450 0.03072312 0.7389503
 [76,] 0.1815503 0.03034845 0.7389503
 [77,] 0.1806661 0.03184713 0.7292818
 [78,] 0.1809608 0.03184713 0.7306630
 [79,] 0.1812555 0.03072312 0.7361878
 [80,] 0.1809608 0.03184713 0.7306630
 [81,] 0.1824344 0.03297115 0.7334254
 [82,] 0.1824344 0.03334582 0.7320442
 [83,] 0.1821397 0.03409517 0.7279006
 [84,] 0.1830239 0.03372049 0.7334254
 [85,] 0.1812555 0.03184713 0.7320442
 [86,] 0.1809608 0.03147246 0.7320442
 [87,] 0.1809608 0.03147246 0.7320442
 [88,] 0.1812555 0.03072312 0.7361878
 [89,] 0.1821397 0.03259648 0.7334254
 [90,] 0.1818450 0.03259648 0.7320442
 [91,] 0.1824344 0.03259648 0.7348066
 [92,] 0.1821397 0.03259648 0.7334254
 [93,] 0.1830239 0.03409517 0.7320442
 [94,] 0.1830239 0.03446984 0.7306630
 [95,] 0.1821397 0.03372049 0.7292818
 [96,] 0.1818450 0.03484451 0.7237569
 [97,] 0.1824344 0.03559386 0.7237569
 [98,] 0.1818450 0.03409517 0.7265193
 [99,] 0.1818450 0.03484451 0.7237569
[100,] 0.1821397 0.03372049 0.7292818
[101,] 0.1824344 0.03446984 0.7279006
[102,] 0.1815503 0.03372049 0.7265193
[103,] 0.1803714 0.03184713 0.7279006
[104,] 0.1815503 0.03334582 0.7279006
[105,] 0.1818450 0.03334582 0.7292818
[106,] 0.1812555 0.03222181 0.7306630
[107,] 0.1815503 0.03259648 0.7306630
[108,] 0.1821397 0.03259648 0.7334254
[109,] 0.1824344 0.03334582 0.7320442
[110,] 0.1818450 0.03222181 0.7334254
[111,] 0.1821397 0.03297115 0.7320442
[112,] 0.1824344 0.03222181 0.7361878
[113,] 0.1821397 0.03222181 0.7348066
[114,] 0.1821397 0.03184713 0.7361878
[115,] 0.1818450 0.03184713 0.7348066
[116,] 0.1818450 0.03184713 0.7348066
[117,] 0.1818450 0.03259648 0.7320442
[118,] 0.1827291 0.03334582 0.7334254
[119,] 0.1818450 0.03259648 0.7320442
[120,] 0.1827291 0.03334582 0.7334254
[121,] 0.1830239 0.03372049 0.7334254
[122,] 0.1821397 0.03334582 0.7306630
[123,] 0.1821397 0.03334582 0.7306630
[124,] 0.1821397 0.03259648 0.7334254
[125,] 0.1821397 0.03184713 0.7361878
[126,] 0.1809608 0.03147246 0.7320442
[127,] 0.1818450 0.03147246 0.7361878
[128,] 0.1824344 0.03222181 0.7361878
[129,] 0.1827291 0.03259648 0.7361878
[130,] 0.1824344 0.03259648 0.7348066
[131,] 0.1818450 0.03222181 0.7334254
[132,] 0.1827291 0.03297115 0.7348066
[133,] 0.1830239 0.03409517 0.7320442
[134,] 0.1842028 0.03446984 0.7361878
[135,] 0.1842028 0.03559386 0.7320442
[136,] 0.1830239 0.03484451 0.7292818
[137,] 0.1836133 0.03521918 0.7306630
[138,] 0.1830239 0.03446984 0.7306630
[139,] 0.1836133 0.03484451 0.7320442
[140,] 0.1827291 0.03409517 0.7306630
[141,] 0.1824344 0.03372049 0.7306630
[142,] 0.1824344 0.03297115 0.7334254
[143,] 0.1821397 0.03297115 0.7320442
[144,] 0.1821397 0.03297115 0.7320442
[145,] 0.1824344 0.03297115 0.7334254
[146,] 0.1824344 0.03259648 0.7348066
[147,] 0.1821397 0.03259648 0.7334254
[148,] 0.1827291 0.03334582 0.7334254
[149,] 0.1830239 0.03372049 0.7334254
[150,] 0.1821397 0.03259648 0.7334254
[151,] 0.1830239 0.03334582 0.7348066
[152,] 0.1824344 0.03259648 0.7348066
[153,] 0.1824344 0.03259648 0.7348066
[154,] 0.1827291 0.03334582 0.7334254
[155,] 0.1815503 0.03297115 0.7292818
[156,] 0.1821397 0.03372049 0.7292818
[157,] 0.1827291 0.03409517 0.7306630
[158,] 0.1815503 0.03334582 0.7279006
[159,] 0.1821397 0.03297115 0.7320442
[160,] 0.1815503 0.03259648 0.7306630
[161,] 0.1818450 0.03297115 0.7306630
[162,] 0.1815503 0.03259648 0.7306630
[163,] 0.1818450 0.03297115 0.7306630
[164,] 0.1821397 0.03222181 0.7348066
[165,] 0.1821397 0.03222181 0.7348066
[166,] 0.1809608 0.03072312 0.7348066
[167,] 0.1815503 0.03184713 0.7334254
[168,] 0.1815503 0.03259648 0.7306630
[169,] 0.1818450 0.03259648 0.7320442
[170,] 0.1818450 0.03297115 0.7306630
[171,] 0.1818450 0.03222181 0.7334254
[172,] 0.1812555 0.03184713 0.7320442
[173,] 0.1806661 0.03109779 0.7320442
[174,] 0.1815503 0.03184713 0.7334254
[175,] 0.1806661 0.03072312 0.7334254
[176,] 0.1812555 0.03147246 0.7334254
[177,] 0.1803714 0.02997377 0.7348066
[178,] 0.1806661 0.03034845 0.7348066
[179,] 0.1806661 0.03109779 0.7320442
[180,] 0.1803714 0.03109779 0.7306630
[181,] 0.1800766 0.03034845 0.7320442
[182,] 0.1806661 0.03034845 0.7348066
[183,] 0.1803714 0.03034845 0.7334254
[184,] 0.1815503 0.03034845 0.7389503
[185,] 0.1803714 0.02922443 0.7375691
[186,] 0.1800766 0.02959910 0.7348066
[187,] 0.1809608 0.02997377 0.7375691
[188,] 0.1803714 0.02959910 0.7361878
[189,] 0.1803714 0.02959910 0.7361878
[190,] 0.1800766 0.02922443 0.7361878
[191,] 0.1797819 0.02884976 0.7361878
[192,] 0.1803714 0.02959910 0.7361878
[193,] 0.1803714 0.02997377 0.7348066
[194,] 0.1797819 0.02959910 0.7334254
[195,] 0.1800766 0.02922443 0.7361878
[196,] 0.1803714 0.02922443 0.7375691
[197,] 0.1812555 0.02922443 0.7417127
[198,] 0.1806661 0.02922443 0.7389503
[199,] 0.1806661 0.02922443 0.7389503
[200,] 0.1803714 0.02884976 0.7389503
[201,] 0.1800766 0.02847508 0.7389503
[202,] 0.1803714 0.02847508 0.7403315
[203,] 0.1803714 0.02884976 0.7389503
[204,] 0.1800766 0.02847508 0.7389503
[205,] 0.1800766 0.02847508 0.7389503
[206,] 0.1797819 0.02772574 0.7403315
[207,] 0.1797819 0.02772574 0.7403315
[208,] 0.1800766 0.02735107 0.7430939
[209,] 0.1797819 0.02735107 0.7417127
[210,] 0.1794872 0.02735107 0.7403315
[211,] 0.1797819 0.02735107 0.7417127
[212,] 0.1800766 0.02772574 0.7417127
[213,] 0.1809608 0.02847508 0.7430939
[214,] 0.1812555 0.02922443 0.7417127
[215,] 0.1815503 0.02959910 0.7417127
[216,] 0.1809608 0.02884976 0.7417127
[217,] 0.1803714 0.02959910 0.7361878
[218,] 0.1800766 0.02884976 0.7375691
[219,] 0.1800766 0.02884976 0.7375691
[220,] 0.1803714 0.02959910 0.7361878
[221,] 0.1797819 0.02847508 0.7375691
[222,] 0.1800766 0.02884976 0.7375691
[223,] 0.1800766 0.02884976 0.7375691
[224,] 0.1803714 0.02922443 0.7375691
[225,] 0.1806661 0.02959910 0.7375691
[226,] 0.1806661 0.02959910 0.7375691
[227,] 0.1809608 0.02997377 0.7375691
[228,] 0.1800766 0.02884976 0.7375691
[229,] 0.1803714 0.02959910 0.7361878
[230,] 0.1815503 0.02997377 0.7403315
[231,] 0.1809608 0.02922443 0.7403315
[232,] 0.1812555 0.02959910 0.7403315
[233,] 0.1812555 0.02959910 0.7403315
[234,] 0.1812555 0.02997377 0.7389503
[235,] 0.1815503 0.03034845 0.7389503
[236,] 0.1803714 0.02959910 0.7361878
[237,] 0.1809608 0.02997377 0.7375691
[238,] 0.1812555 0.02997377 0.7389503
[239,] 0.1809608 0.03034845 0.7361878
[240,] 0.1809608 0.03072312 0.7348066
[241,] 0.1815503 0.03109779 0.7361878
[242,] 0.1806661 0.03072312 0.7334254
[243,] 0.1815503 0.03072312 0.7375691
[244,] 0.1800766 0.02922443 0.7361878
[245,] 0.1806661 0.02997377 0.7361878
[246,] 0.1815503 0.03072312 0.7375691
[247,] 0.1806661 0.02997377 0.7361878
[248,] 0.1806661 0.02959910 0.7375691
[249,] 0.1806661 0.02959910 0.7375691
[250,] 0.1812555 0.02997377 0.7389503
[251,] 0.1806661 0.02922443 0.7389503
[252,] 0.1815503 0.02997377 0.7403315
[253,] 0.1815503 0.03034845 0.7389503
[254,] 0.1812555 0.02959910 0.7403315
[255,] 0.1812555 0.02959910 0.7403315
[256,] 0.1809608 0.02959910 0.7389503
[257,] 0.1815503 0.02997377 0.7403315
[258,] 0.1815503 0.02997377 0.7403315
[259,] 0.1809608 0.02959910 0.7389503
[260,] 0.1803714 0.02922443 0.7375691
[261,] 0.1806661 0.02922443 0.7389503
[262,] 0.1812555 0.02959910 0.7403315
[263,] 0.1806661 0.02922443 0.7389503
[264,] 0.1797819 0.02810041 0.7389503
[265,] 0.1803714 0.02884976 0.7389503
[266,] 0.1806661 0.02959910 0.7375691
[267,] 0.1806661 0.02922443 0.7389503
[268,] 0.1803714 0.02884976 0.7389503
[269,] 0.1800766 0.02847508 0.7389503
[270,] 0.1803714 0.02922443 0.7375691
[271,] 0.1800766 0.02884976 0.7375691
[272,] 0.1818450 0.02997377 0.7417127
[273,] 0.1821397 0.02997377 0.7430939
[274,] 0.1818450 0.02997377 0.7417127
[275,] 0.1812555 0.02997377 0.7389503
[276,] 0.1815503 0.02997377 0.7403315
[277,] 0.1818450 0.02997377 0.7417127
[278,] 0.1821397 0.03034845 0.7417127
[279,] 0.1818450 0.03034845 0.7403315
[280,] 0.1818450 0.02997377 0.7417127
[281,] 0.1812555 0.02997377 0.7389503
[282,] 0.1812555 0.02997377 0.7389503
[283,] 0.1809608 0.03034845 0.7361878
[284,] 0.1812555 0.03034845 0.7375691
[285,] 0.1809608 0.03034845 0.7361878
[286,] 0.1806661 0.03034845 0.7348066
[287,] 0.1809608 0.03034845 0.7361878
[288,] 0.1803714 0.02997377 0.7348066
[289,] 0.1800766 0.02997377 0.7334254
[290,] 0.1803714 0.02997377 0.7348066
[291,] 0.1803714 0.03034845 0.7334254
[292,] 0.1800766 0.03034845 0.7320442
[293,] 0.1797819 0.03072312 0.7292818
[294,] 0.1803714 0.02997377 0.7348066
[295,] 0.1794872 0.02997377 0.7306630
[296,] 0.1800766 0.03072312 0.7306630
[297,] 0.1800766 0.03072312 0.7306630
[298,] 0.1794872 0.03072312 0.7279006
[299,] 0.1797819 0.03147246 0.7265193
[300,] 0.1791925 0.03109779 0.7251381
[301,] 0.1797819 0.03109779 0.7279006
[302,] 0.1794872 0.03072312 0.7279006
[303,] 0.1791925 0.03072312 0.7265193
[304,] 0.1791925 0.03034845 0.7279006
[305,] 0.1794872 0.03034845 0.7292818
[306,] 0.1794872 0.03072312 0.7279006
[307,] 0.1786030 0.02997377 0.7265193
[308,] 0.1786030 0.03034845 0.7251381
[309,] 0.1788977 0.03034845 0.7265193
[310,] 0.1791925 0.03072312 0.7265193
[311,] 0.1797819 0.03109779 0.7279006
[312,] 0.1797819 0.03147246 0.7265193
[313,] 0.1794872 0.03109779 0.7265193
[314,] 0.1803714 0.03184713 0.7279006
[315,] 0.1797819 0.03147246 0.7265193
[316,] 0.1803714 0.03184713 0.7279006
[317,] 0.1803714 0.03184713 0.7279006
[318,] 0.1797819 0.03147246 0.7265193
[319,] 0.1800766 0.03147246 0.7279006
[320,] 0.1794872 0.03147246 0.7251381
[321,] 0.1797819 0.03147246 0.7265193
[322,] 0.1797819 0.03109779 0.7279006
[323,] 0.1797819 0.03109779 0.7279006
[324,] 0.1800766 0.03109779 0.7292818
[325,] 0.1797819 0.03109779 0.7279006
[326,] 0.1791925 0.03109779 0.7251381
[327,] 0.1797819 0.03109779 0.7279006
[328,] 0.1800766 0.03147246 0.7279006
[329,] 0.1794872 0.03147246 0.7251381
[330,] 0.1800766 0.03184713 0.7265193
[331,] 0.1797819 0.03147246 0.7265193
[332,] 0.1791925 0.03109779 0.7251381
[333,] 0.1794872 0.03109779 0.7265193
[334,] 0.1797819 0.03147246 0.7265193
[335,] 0.1797819 0.03147246 0.7265193
[336,] 0.1800766 0.03109779 0.7292818
[337,] 0.1797819 0.03147246 0.7265193
[338,] 0.1797819 0.03147246 0.7265193
[339,] 0.1800766 0.03184713 0.7265193
[340,] 0.1797819 0.03184713 0.7251381
[341,] 0.1794872 0.03147246 0.7251381
[342,] 0.1791925 0.03072312 0.7265193
[343,] 0.1788977 0.03034845 0.7265193
[344,] 0.1791925 0.03034845 0.7279006
[345,] 0.1797819 0.03072312 0.7292818
[346,] 0.1794872 0.03109779 0.7265193
[347,] 0.1797819 0.03109779 0.7279006
[348,] 0.1794872 0.03072312 0.7279006
[349,] 0.1794872 0.03109779 0.7265193
[350,] 0.1800766 0.03147246 0.7279006
[351,] 0.1800766 0.03147246 0.7279006
[352,] 0.1806661 0.03184713 0.7292818
[353,] 0.1806661 0.03147246 0.7306630
[354,] 0.1803714 0.03147246 0.7292818
[355,] 0.1803714 0.03109779 0.7306630
[356,] 0.1803714 0.03147246 0.7292818
[357,] 0.1806661 0.03147246 0.7306630
[358,] 0.1803714 0.03109779 0.7306630
[359,] 0.1812555 0.03222181 0.7306630
[360,] 0.1803714 0.03147246 0.7292818
[361,] 0.1806661 0.03147246 0.7306630
[362,] 0.1803714 0.03109779 0.7306630
[363,] 0.1809608 0.03147246 0.7320442
[364,] 0.1809608 0.03147246 0.7320442
[365,] 0.1806661 0.03147246 0.7306630
[366,] 0.1803714 0.03147246 0.7292818
[367,] 0.1800766 0.03147246 0.7279006
[368,] 0.1803714 0.03147246 0.7292818
[369,] 0.1800766 0.03109779 0.7292818
[370,] 0.1806661 0.03147246 0.7306630
[371,] 0.1806661 0.03147246 0.7306630
[372,] 0.1806661 0.03147246 0.7306630
[373,] 0.1803714 0.03109779 0.7306630
[374,] 0.1803714 0.03072312 0.7320442
[375,] 0.1800766 0.03072312 0.7306630
[376,] 0.1806661 0.03109779 0.7320442
[377,] 0.1803714 0.03109779 0.7306630
[378,] 0.1803714 0.03109779 0.7306630
[379,] 0.1806661 0.03147246 0.7306630
[380,] 0.1809608 0.03184713 0.7306630
[381,] 0.1806661 0.03147246 0.7306630
[382,] 0.1803714 0.03109779 0.7306630
[383,] 0.1797819 0.03034845 0.7306630
[384,] 0.1800766 0.03034845 0.7320442
[385,] 0.1800766 0.03072312 0.7306630
[386,] 0.1803714 0.03109779 0.7306630
[387,] 0.1803714 0.03072312 0.7320442
[388,] 0.1809608 0.03147246 0.7320442
[389,] 0.1806661 0.03109779 0.7320442
[390,] 0.1809608 0.03109779 0.7334254
[391,] 0.1800766 0.03109779 0.7292818
[392,] 0.1806661 0.03147246 0.7306630
[393,] 0.1806661 0.03147246 0.7306630
[394,] 0.1809608 0.03147246 0.7320442
[395,] 0.1815503 0.03147246 0.7348066
[396,] 0.1815503 0.03184713 0.7334254
[397,] 0.1812555 0.03184713 0.7320442
[398,] 0.1809608 0.03147246 0.7320442
[399,] 0.1809608 0.03147246 0.7320442
[400,] 0.1812555 0.03147246 0.7334254
[401,] 0.1809608 0.03147246 0.7320442
[402,] 0.1818450 0.03184713 0.7348066
[403,] 0.1818450 0.03184713 0.7348066
[404,] 0.1812555 0.03147246 0.7334254
[405,] 0.1815503 0.03184713 0.7334254
[406,] 0.1818450 0.03147246 0.7361878
[407,] 0.1815503 0.03184713 0.7334254
[408,] 0.1815503 0.03184713 0.7334254
[409,] 0.1815503 0.03184713 0.7334254
[410,] 0.1821397 0.03222181 0.7348066
[411,] 0.1824344 0.03259648 0.7348066
[412,] 0.1821397 0.03259648 0.7334254
[413,] 0.1812555 0.03222181 0.7306630
[414,] 0.1821397 0.03259648 0.7334254
[415,] 0.1821397 0.03297115 0.7320442
[416,] 0.1818450 0.03259648 0.7320442
[417,] 0.1818450 0.03259648 0.7320442
[418,] 0.1815503 0.03259648 0.7306630
[419,] 0.1818450 0.03259648 0.7320442
[420,] 0.1818450 0.03259648 0.7320442
[421,] 0.1821397 0.03334582 0.7306630
[422,] 0.1818450 0.03297115 0.7306630
[423,] 0.1824344 0.03372049 0.7306630
[424,] 0.1818450 0.03297115 0.7306630
[425,] 0.1824344 0.03334582 0.7320442
[426,] 0.1818450 0.03259648 0.7320442
[427,] 0.1815503 0.03222181 0.7320442
[428,] 0.1815503 0.03259648 0.7306630
[429,] 0.1818450 0.03334582 0.7292818
[430,] 0.1818450 0.03297115 0.7306630
[431,] 0.1818450 0.03297115 0.7306630
[432,] 0.1809608 0.03184713 0.7306630
[433,] 0.1809608 0.03184713 0.7306630
[434,] 0.1818450 0.03259648 0.7320442
[435,] 0.1821397 0.03334582 0.7306630
[436,] 0.1815503 0.03222181 0.7320442
[437,] 0.1821397 0.03297115 0.7320442
[438,] 0.1821397 0.03259648 0.7334254
[439,] 0.1818450 0.03184713 0.7348066
[440,] 0.1818450 0.03222181 0.7334254
[441,] 0.1815503 0.03222181 0.7320442
[442,] 0.1821397 0.03297115 0.7320442
[443,] 0.1824344 0.03334582 0.7320442
[444,] 0.1827291 0.03334582 0.7334254
[445,] 0.1818450 0.03222181 0.7334254
[446,] 0.1818450 0.03222181 0.7334254
[447,] 0.1818450 0.03222181 0.7334254
[448,] 0.1821397 0.03222181 0.7348066
[449,] 0.1821397 0.03222181 0.7348066
[450,] 0.1827291 0.03297115 0.7348066
[451,] 0.1827291 0.03297115 0.7348066
[452,] 0.1827291 0.03259648 0.7361878
[453,] 0.1830239 0.03297115 0.7361878
[454,] 0.1827291 0.03259648 0.7361878
[455,] 0.1827291 0.03222181 0.7375691
[456,] 0.1827291 0.03184713 0.7389503
[457,] 0.1827291 0.03184713 0.7389503
[458,] 0.1824344 0.03147246 0.7389503
[459,] 0.1824344 0.03147246 0.7389503
[460,] 0.1821397 0.03147246 0.7375691
[461,] 0.1821397 0.03147246 0.7375691
[462,] 0.1821397 0.03147246 0.7375691
[463,] 0.1821397 0.03147246 0.7375691
[464,] 0.1815503 0.03147246 0.7348066
[465,] 0.1821397 0.03184713 0.7361878
[466,] 0.1818450 0.03147246 0.7361878
[467,] 0.1824344 0.03184713 0.7375691
[468,] 0.1821397 0.03147246 0.7375691
[469,] 0.1821397 0.03147246 0.7375691
[470,] 0.1815503 0.03072312 0.7375691
[471,] 0.1815503 0.03034845 0.7389503
[472,] 0.1809608 0.02997377 0.7375691
[473,] 0.1818450 0.03109779 0.7375691
[474,] 0.1821397 0.03184713 0.7361878
[475,] 0.1821397 0.03147246 0.7375691
[476,] 0.1818450 0.03147246 0.7361878
[477,] 0.1812555 0.03072312 0.7361878
[478,] 0.1815503 0.03109779 0.7361878
[479,] 0.1821397 0.03147246 0.7375691
[480,] 0.1812555 0.03034845 0.7375691
[481,] 0.1815503 0.03109779 0.7361878
[482,] 0.1815503 0.03072312 0.7375691
[483,] 0.1815503 0.03072312 0.7375691
[484,] 0.1812555 0.03072312 0.7361878
[485,] 0.1818450 0.03147246 0.7361878
[486,] 0.1821397 0.03147246 0.7375691
[487,] 0.1809608 0.03034845 0.7361878
[488,] 0.1815503 0.03109779 0.7361878
[489,] 0.1818450 0.03109779 0.7375691
[490,] 0.1815503 0.03147246 0.7348066
[491,] 0.1818450 0.03147246 0.7361878
[492,] 0.1818450 0.03147246 0.7361878
[493,] 0.1815503 0.03109779 0.7361878
[494,] 0.1812555 0.03072312 0.7361878
[495,] 0.1812555 0.03109779 0.7348066
[496,] 0.1818450 0.03184713 0.7348066
[497,] 0.1812555 0.03109779 0.7348066
[498,] 0.1815503 0.03109779 0.7361878
[499,] 0.1815503 0.03147246 0.7348066
[500,] 0.1815503 0.03147246 0.7348066
summary (rf_model)
                Length Class  Mode     
call               4   -none- call     
type               1   -none- character
predicted       3393   factor numeric  
err.rate        1500   -none- numeric  
confusion          6   -none- numeric  
votes           6786   matrix numeric  
oob.times       3393   -none- numeric  
classes            2   -none- character
importance        11   -none- numeric  
importanceSD       0   -none- NULL     
localImportance    0   -none- NULL     
proximity          0   -none- NULL     
ntree              1   -none- numeric  
mtry               1   -none- numeric  
forest            14   -none- list     
y               3393   factor numeric  
test               0   -none- NULL     
inbag              0   -none- NULL     
terms              3   terms  call     
# Predict admission (0/1) on the held-out test set with the RQ1 random forest.
predictions = predict(rf_model, newdata = test_data)

# Exploratory model-performance checks (left commented out):
#rf_model$mse
#which.min(mse) #362 trees used 

# Overall test accuracy (~81.7%). NOTE(review): the no-information rate in the
# confusion matrix below is 79.7%, so accuracy alone overstates the fit, and
# specificity for the admitted class is only 0.23.
mean(predictions == test_data$admitted) #81.7
[1] 0.817217
confusionMatrix(predictions, as.factor(test_data$admitted)) #confusion matrix
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 653 132
         1  23  40
                                          
               Accuracy : 0.8172          
                 95% CI : (0.7895, 0.8427)
    No Information Rate : 0.7972          
    P-Value [Acc > NIR] : 0.07809         
                                          
                  Kappa : 0.2599          
                                          
 Mcnemar's Test P-Value : < 2e-16         
                                          
            Sensitivity : 0.9660          
            Specificity : 0.2326          
         Pos Pred Value : 0.8318          
         Neg Pred Value : 0.6349          
             Prevalence : 0.7972          
         Detection Rate : 0.7700          
   Detection Prevalence : 0.9257          
      Balanced Accuracy : 0.5993          
                                          
       'Positive' Class : 0               
                                          

Research Question 2: Which characteristics of an ER visit are most strongly associated with the highest total ER expenditure?

Library Download

library(tidymodels)
library(glmnet)
library(xgboost)
library(vip)
library(haven)
library(ranger)
library(randomForest)
library(tree)
library(ISLR)
library(dplyr)
library (plotly)
library(ggplot2)
library(dplyr)
library(caret)

Adding in Insurance Information

#Cleaning Data to find Insurance v No Insurance
OGData_clean <- OGData_clean %>%
  mutate(Insurance = case_when(
    (private_fac > 0  |  private_doc > 0 | medicaid_fac > 0 | medicaid_doc > 0 | medicare_fac > 0 | medicare_doc > 0) ~ "1",
    TRUE ~ "0"
  ))

OGData_clean$Insurance <- as.numeric(OGData_clean$Insurance)

Cleaning up the binary variables: the raw survey coding uses 1 = yes and 2 = no, with special codes (e.g. 95, -8 = "don't know") mixed in. Everything is recoded to 0/1, where 1 = yes and 0 = no (all non-yes codes collapse to 0).

# Recode yes/no survey variables to 0/1: in the raw coding 1 = yes; 2 (no)
# and negative special codes (-7/-8 refused / don't know) all collapse to 0.
# This recode is idempotent, so re-running the chunk is harmless.
yesno_vars <- c(
  "mri_ct", "surgery", "xray", "lab_tests", "ekg",
  "ultrasound", "mammogram", "vaccination", "rx_given",
  "related_condition")

OGData_clean <- OGData_clean %>%
  mutate(across(all_of(yesno_vars), ~ ifelse(. == 1, 1, 0))) # keep the 1s, zero the rest

# Recode `admitted` (raw coding: -1 = not admitted) to a 0/1 indicator.
# BUG FIX: the original `ifelse(admitted == "-1", 0, 1)` is not idempotent --
# when this chunk runs after the RQ1 chunk has already recoded the column,
# the existing 0s (not admitted) get mapped to 1, flagging every visit as
# admitted (the rendered table below shows all 4241 rows equal to 1, which
# also makes `admitted` a constant, useless predictor downstream).
# `%in% c(-1, 0)` keeps the recode stable across repeated executions.
OGData_clean$admitted <- ifelse(OGData_clean$admitted %in% c(-1, 0), 0, 1)
table(OGData_clean$admitted) # expected: not admitted 3345; admitted 896

   1 
4241 

RQ2: Which characteristics of an ER visit (specific tests, procedures, or diagnoses) are most strongly associated with the highest total ER expenditures? Note that the cost variable contains a large number of outliers.

Making Dataset for RQ2

# Subset the cleaned data to the RQ2 analysis columns: the admission flag,
# the ten test/procedure indicators (same set, same order as yesno_vars),
# and the outcome total_cost.
RQ2Data <- OGData_clean %>%
  dplyr::select(admitted, all_of(yesno_vars), total_cost)

Exploring Dataset / Characteristics

# Total cost is heavily right-skewed: median 578 vs mean 1227.9, max 195314.5.
summary (RQ2Data$total_cost) #Median - 578, Mean 1227.9 Max: 195314.5
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0    206.7    578.2   1227.9   1272.0 195314.5 
sum(RQ2Data$total_cost == 0, na.rm = TRUE) #367 visits with zero total cost
[1] 367
sum(RQ2Data$total_cost != 0, na.rm = TRUE) #3874 visits with a positive total cost
[1] 3874
# Flag each visit as zero-cost ("0") vs positive-cost (">0") for plotting.
RQ2DataViz <- RQ2Data %>%
  mutate(cost_group = ifelse(total_cost == 0, "0", ">0"))

# Bar chart: the large majority of visits have a positive total cost.
ggplotly(
  ggplot(RQ2DataViz, aes(x = cost_group, fill = cost_group)) +
    geom_bar() +
    labs(x = "Total Cost", y = "Count",
         title = "Count of 0 vs >0 Total Cost") +
    theme_minimal()
)
# Boxplot of total cost. FIX: total_cost is mapped to the y axis, so the
# dollar label belongs on y (the original had xlab/ylab swapped, labeling
# the empty x axis "Total Cost ($)" and the y axis "Count").
ggplotly(ggplot(RQ2Data, aes(x = "", y = total_cost)) +
  geom_boxplot(fill = "steelblue") +
  ggtitle("Total Cost Distribution of ER Visits") +
  xlab("") +
  ylab("Total Cost ($)") +
  coord_cartesian(ylim = c(0, 5000))) # zoom in; extreme outliers otherwise hide the box
# Histogram of total cost, restricted to the 0-5000 range.
# NOTE(review): scale_x_continuous(limits = ...) DROPS the 143 observations
# above 5000 before binning (see the warning below) rather than merely
# zooming; use coord_cartesian(xlim = ...) if those rows should remain in
# the bin counts.
ggplot(RQ2Data, aes(x = total_cost)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  scale_x_continuous(limits = c(0, 5000)) + # focus on main range
  ggtitle("Total Cost Distribution of ER Visits") +
  xlab("Total Cost ($)") +
  ylab("Count") +
  theme_minimal() 
Warning: Removed 143 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_bar()`).

Random Forest Model (chosen because the cost distribution contains many outliers, to which tree-based methods are comparatively robust)

library(randomForest)
library(caret)

# 70/30 train/test split; createDataPartition() stratifies on the outcome.
set.seed(1)
train_index1 = createDataPartition(RQ2Data$total_cost, p = 0.7, list = FALSE) 
train_data1 = RQ2Data[train_index1, ]
test_data1 = RQ2Data[-train_index1, ]


# Random forest regression of total cost on admission + test indicators.
# NOTE(review): the OOB "% Var explained" reported below is negative (-1.74),
# i.e. the forest does no better than predicting the mean -- interpret the
# importance rankings with caution.
rf.RQ2=randomForest(total_cost~.,data=train_data1, ntree = 500, importance = TRUE)
varImpPlot(rf.RQ2)

rf.RQ2

Call:
 randomForest(formula = total_cost ~ ., data = train_data1, ntree = 500,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 3

          Mean of squared residuals: 18790068
                    % Var explained: -1.74
plot(rf.RQ2)

importance(rf.RQ2)
                      %IncMSE IncNodePurity
admitted            0.0000000             0
mri_ct             -7.7846562     686840954
surgery            -2.4621831     335768047
xray               -7.8670960     415315607
lab_tests          -7.7045082     214092768
ekg               -11.4278188     420924965
ultrasound        -12.4757435     728741962
mammogram          -0.7349283      29174336
vaccination         4.2598684      61253195
rx_given           -7.3649995     309750976
related_condition  -4.4996245      61669431
summary (rf.RQ2)
                Length Class  Mode     
call               5   -none- call     
type               1   -none- character
predicted       2970   -none- numeric  
mse              500   -none- numeric  
rsq              500   -none- numeric  
oob.times       2970   -none- numeric  
importance        22   -none- numeric  
importanceSD      11   -none- numeric  
localImportance    0   -none- NULL     
proximity          0   -none- NULL     
ntree              1   -none- numeric  
mtry               1   -none- numeric  
forest            11   -none- list     
coefs              0   -none- NULL     
y               2970   -none- numeric  
test               0   -none- NULL     
inbag              0   -none- NULL     
terms              3   terms  call     
# REGRESSION ANALYSIS FOR CONTINUOUS OUTCOME (total_cost)
# Predict total cost on the held-out test set.
predictionsRQ2 = predict(rf.RQ2, newdata = test_data1)

# Evaluate model performance on the test set (RMSE, MAE, R^2).
# NOTE(review): the inline values in the comments below appear to come from
# an earlier run and are inconsistent with the OOB fit above
# (% Var explained = -1.74); re-run and verify before quoting them.
rmse <- sqrt(mean((test_data1$total_cost - predictionsRQ2)^2)) #RMSE 997.89
mae  <- mean(abs(test_data1$total_cost - predictionsRQ2)) #MAE: 243.08
rsq  <- cor(test_data1$total_cost, predictionsRQ2)^2 #RSQLM: 0.97

Multiple Linear Regression Model

# Baseline: multiple linear regression of total cost on all predictors.
# NOTE(review): `admitted` comes out NA in the summary below ("1 not defined
# because of singularities") -- consistent with that column being constant
# after the admitted recode (the earlier table() shows every row equal to 1).
fit = lm(total_cost ~ ., data = train_data1)
pred_lm = predict(fit, newdata = test_data1)

summary(fit)

Call:
lm(formula = total_cost ~ ., data = train_data1)

Residuals:
   Min     1Q Median     3Q    Max 
 -2768   -860   -466    140 192804 

Coefficients: (1 not defined because of singularities)
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)         805.95     229.23   3.516 0.000445 ***
admitted                NA         NA      NA       NA    
mri_ct              828.17     190.22   4.354 1.38e-05 ***
surgery             257.15     343.81   0.748 0.454558    
xray                 76.41     167.75   0.456 0.648767    
lab_tests           140.77     169.19   0.832 0.405476    
ekg                 243.77     209.98   1.161 0.245757    
ultrasound          502.06     253.63   1.980 0.047850 *  
mammogram          -395.80    1920.73  -0.206 0.836754    
vaccination        -575.54     646.04  -0.891 0.373072    
rx_given            189.97     174.42   1.089 0.276171    
related_condition   -86.66     229.00  -0.378 0.705140    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4280 on 2959 degrees of freedom
Multiple R-squared:  0.01195,   Adjusted R-squared:  0.008614 
F-statistic:  3.58 on 10 and 2959 DF,  p-value: 9.746e-05
sqrt(mean((test_data1$total_cost - pred_lm)^2)) #RMSE (output below: 3952.41; the previously noted 3689.77 was from a prior run)
[1] 3952.412
mean(abs(test_data1$total_cost - pred_lm)) #MAE (output below: 1253.43)
[1] 1253.426
cor(test_data1$total_cost, pred_lm)^2 #test-set R^2 (output below: 0.0057)
[1] 0.005716896

Research Question 3: What impacts the number of tests and procedures a patient gets done in the ER?

Cleaning up the binary variables (repeated for RQ3): raw coding is 1 = yes, 2 = no, with special codes such as 95 and -8 ("don't know"). All values are recoded to 0/1, where 1 = yes and 0 = no.

# Recode yes/no survey variables to 0/1 (1 = yes; 2/no and negative special
# codes all become 0). This recode is idempotent, so re-running is harmless.
yesno_vars <- c(
  "mri_ct", "surgery", "xray", "lab_tests", "ekg",
  "ultrasound", "mammogram", "vaccination", "rx_given",
  "related_condition")

OGData_clean <- OGData_clean %>%
  mutate(across(all_of(yesno_vars), ~ ifelse(. == 1, 1, 0))) # keep the 1s, zero the rest

# Recode `admitted` (raw: -1 = not admitted) to 0/1.
# BUG FIX: the original `ifelse(admitted == "-1", 0, 1)` is not idempotent --
# on a second run the previously recoded 0s are sent to 1, flagging every
# visit as admitted. `%in% c(-1, 0)` keeps the recode stable no matter how
# many times the chunk executes.
OGData_clean$admitted <- ifelse(OGData_clean$admitted %in% c(-1, 0), 0, 1)

Adding in Insurance Information

# Build the RQ3 analysis dataset.

# Insurance indicator: insured (1) if any private/medicaid/medicare payment
# (facility or doctor) is positive. Built directly as numeric instead of the
# original "0"/"1" string + as.numeric() round trip; case_when() NA handling
# is unchanged (rows with all-NA conditions fall through to 0).
RQ3Data <- OGData_clean %>%
  mutate(Insurance = case_when(
    private_fac > 0 | private_doc > 0 |
      medicaid_fac > 0 | medicaid_doc > 0 |
      medicare_fac > 0 | medicare_doc > 0 ~ 1,
    TRUE ~ 0
  ))

# Tests / procedures that contribute to the per-visit count.
test_vars <- c("mri_ct", "surgery", "xray", "lab_tests", 
               "ekg", "ultrasound", "mammogram", 
               "vaccination", "rx_given", "related_condition")

# Per-visit count of tests. rowSums(across(...)) is fully vectorized and
# replaces the original rowwise()/c_across() row loop -- identical result
# (same na.rm behavior), much faster, and leaves the tibble ungrouped.
RQ3Data <- RQ3Data %>%
  mutate(total_tests = rowSums(across(all_of(test_vars)), na.rm = TRUE))

# Keep only what RQ3 needs; person_id (DUPERSID) is retained for the merge.
RQ3Data <- RQ3Data %>%
  dplyr::select(Insurance, total_tests, person_id)

Merging New Dataset

library(haven)
library(dplyr)

# Downloaded SAS file (h251; POVCAT23 suggests the 2023 MEPS panel -- TODO
# confirm) and keep only the demographic variables needed for RQ3.
MergeDataDemo <- read_sas("C:/Users/diane/Downloads/h251.sas7bdat") %>%
  dplyr::select(DUPERSID, SEX,#1 male, 2 female
                POVCAT23) #1 is low income to 5 high

# Harmonize the join keys to numeric on both sides.
# NOTE(review): converting both sides the same way keeps the join
# consistent, but verify no IDs are corrupted by the coercion
# (e.g. leading zeros or non-numeric characters in DUPERSID).
RQ3Data <- RQ3Data %>%
  mutate(person_id = as.numeric(person_id))

MergeDataDemo <- MergeDataDemo %>%
  mutate(DUPERSID = as.numeric(DUPERSID))

# Left join keeps every ER-visit row, attaching SEX and POVCAT23.
MergedDataRQ3 <- RQ3Data %>%
  left_join(MergeDataDemo, by = c("person_id" = "DUPERSID"))

# Drop the key once the merge is done.
MergedDataRQ3 <- MergedDataRQ3 %>%
   dplyr::select(-person_id)#no need after merge

# Recode sex to an indicator: 0 = male, 1 = female (any other code -> NA).
MergedDataRQ3 <- MergedDataRQ3 %>%
  mutate(SEX = ifelse(SEX == 1, 0,
                      ifelse(SEX == 2, 1, NA)))

# Missing-data check: 0 NAs after the merge and recodes.
sum(is.na(MergedDataRQ3)) #no missing
[1] 0

Data Visualization

library (ggplot2)
library (plotly)
# Label the insurance indicator for plotting (0/1 -> No/Yes).
RQ3Viz <- MergedDataRQ3 %>%
  mutate(Insurance = factor(Insurance,
                            levels = c(0, 1),
                            labels = c("No", "Yes")))


# Grouped bar chart of insurance status within each poverty category.
ggplotly(ggplot(RQ3Viz, aes(x = factor(POVCAT23), fill = Insurance)) +
  geom_bar(position = "dodge") +
  labs(x = "Poverty Category", y = "Count",
       title = "Poverty Category by Insurance Status"))
# NOTE(review): per the row proportions below, the uninsured are not
# concentrated in the lowest category -- categories 4-5 account for
# roughly 60% of the uninsured.

# Counts of insurance status by poverty category.
table(RQ3Viz$Insurance, RQ3Viz$POVCAT23)
     
        1   2   3   4   5
  No  144  34  77 211 178
  Yes 833 276 581 913 994
prop.table(table(RQ3Viz$Insurance, RQ3Viz$POVCAT23), margin = 1) * 100 #row proportions (%), per insurance status
     
              1         2         3         4         5
  No  22.360248  5.279503 11.956522 32.763975 27.639752
  Yes 23.158187  7.673061 16.152349 25.382263 27.634140
table (RQ3Viz$Insurance)# Imbalanced data 644 No insurance 3597 Insurance

  No  Yes 
 644 3597 
table (RQ3Viz$total_tests)

   0    1    2    3    4    5    6    7    8 
 145  735 1230 1050  653  306   98   19    5 
str (RQ3Viz) #all numerical
tibble [4,241 × 4] (S3: tbl_df/tbl/data.frame)
 $ Insurance  : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 2 2 2 2 ...
 $ total_tests: num [1:4241] 3 3 2 5 3 3 3 2 3 3 ...
 $ SEX        : num [1:4241] 0 1 0 0 0 1 1 1 1 0 ...
 $ POVCAT23   : num [1:4241] 1 5 3 3 5 3 5 5 4 4 ...
  ..- attr(*, "label")= chr "FAMILY INC AS % OF POVERTY LINE - CATEGORICAL"
prop.table(table(RQ3Viz$Insurance, RQ3Viz$total_tests), margin = 1) * 100
     
                0           1           2           3           4           5
  No   2.95031056 16.92546584 27.95031056 21.27329193 16.77018634  8.69565217
  Yes  3.50291910 17.40339172 29.19099249 25.38226300 15.15151515  6.95023631
     
                6           7           8
  No   3.88198758  1.24223602  0.31055901
  Yes  2.02946900  0.30581040  0.08340284
# Distribution of test counts: peaks at 2 and tails off toward 8 (the table
# above shows the long tail on the high end, i.e. a right skew).
# TODO: consider overlaying a density curve on this bar chart.
ggplotly(ggplot(RQ3Viz, aes(x = factor(total_tests), fill = factor(Insurance))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Number of ER Tests by Insurance Status",
    x = "Total Tests/Procedures",
    y = "Number of Patients",
    fill = "Insurance"
  ) +
  theme_minimal())

Multiple Linear Regression

library (caret)

# 70/30 split of the merged RQ3 data, stratified on total_tests.
set.seed(1)
train_index3 <- createDataPartition(MergedDataRQ3$total_tests, 
                                    p = 0.7, 
                                    list = FALSE)

# Training and testing datasets
train_data3 <- MergedDataRQ3[train_index3, ]
test_data3 <- MergedDataRQ3[-train_index3, ]

# Train the linear model on the training data; categorical predictors are
# wrapped in factor() so each level gets its own coefficient.
fit3 <- lm(total_tests ~ factor(Insurance) + factor(SEX) + factor(POVCAT23),
           data = train_data3)
# Predict on the held-out test data.
pred_lm3 = predict(fit3, newdata = test_data3)

summary(fit3)

Call:
lm(formula = total_tests ~ factor(Insurance) + factor(SEX) + 
    factor(POVCAT23), data = train_data3)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.9708 -0.8006  0.0292  1.0552  5.2626 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)         2.73161    0.08486  32.188  < 2e-16 ***
factor(Insurance)1 -0.17027    0.07134  -2.387 0.017067 *  
factor(SEX)1       -0.02602    0.05236  -0.497 0.619282    
factor(POVCAT23)2   0.11644    0.11075   1.051 0.293184    
factor(POVCAT23)3   0.02835    0.08245   0.344 0.731004    
factor(POVCAT23)4   0.00577    0.07258   0.079 0.936645    
factor(POVCAT23)5   0.23923    0.07195   3.325 0.000896 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.386 on 2963 degrees of freedom
Multiple R-squared:  0.007598,  Adjusted R-squared:  0.005588 
F-statistic: 3.781 on 6 and 2963 DF,  p-value: 0.0009368
# Performance of the linear model on the held-out test data.
RMSE(pred_lm3, test_data3$total_tests) #1.37 RMSE
[1] 1.373271
mean(abs(test_data3$total_tests - pred_lm3)) #1.115986 MAE
[1] 1.115986
cor(test_data3$total_tests, pred_lm3)^2 #0.007887108 test-set R^2: model explains <1% of variance
[1] 0.007887108
var(test_data3$total_tests) #variance of total_tests is ~1.9 -- counts cluster tightly around the mean
[1] 1.902351
# Exploratory side model (left commented out):
#hm = lm(Insurance ~ SEX, data = MergedDataRQ3)
#summary (hm) #NOTE(review): an earlier run suggested female (SEX = 1) -> ~6.6% higher insured probability; re-run to confirm.

Poisson regression

set.seed(1)

# Poisson regression: total_tests is a count outcome, so model the log of
# its expected value as a function of insurance, sex, and poverty category.
model_pois <- glm(total_tests ~ factor(Insurance) + factor(SEX) + factor(POVCAT23),
                  family = poisson(link = "log"), data = train_data3)
# Predictions on the response (expected-count) scale for the test data.
Poss_pred_lm3 = predict(model_pois, newdata = test_data3,type = "response")

summary(model_pois)

Call:
glm(formula = total_tests ~ factor(Insurance) + factor(SEX) + 
    factor(POVCAT23), family = poisson(link = "log"), data = train_data3)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         1.003330   0.037225  26.953  < 2e-16 ***
factor(Insurance)1 -0.062767   0.030934  -2.029  0.04245 *  
factor(SEX)1       -0.009803   0.023172  -0.423  0.67225    
factor(POVCAT23)2   0.044392   0.049095   0.904  0.36588    
factor(POVCAT23)3   0.010936   0.037021   0.295  0.76768    
factor(POVCAT23)4   0.002358   0.032623   0.072  0.94237    
factor(POVCAT23)5   0.088898   0.031737   2.801  0.00509 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2338.2  on 2969  degrees of freedom
Residual deviance: 2322.0  on 2963  degrees of freedom
AIC: 10379

Number of Fisher Scoring iterations: 4
RMSE(test_data3$total_tests, Poss_pred_lm3) #1.373308 RMSE
[1] 1.373308
mean(abs(test_data3$total_tests - Poss_pred_lm3)) #1.116015 MAE
[1] 1.116015
cor(test_data3$total_tests, Poss_pred_lm3)^2 #test-set R^2 (output below: 0.007833)
[1] 0.007833137

Random Forest Regression

library(randomForest)

# Reuse the same train/test partition as the linear and Poisson models so
# all three RQ3 models are compared on identical data.
train_data3 <- MergedDataRQ3[train_index3, ]
test_data3 <- MergedDataRQ3[-train_index3, ]

# Random forest regression of test counts on the remaining predictors.
rf.RQ3=randomForest(total_tests~.,data=train_data3, ntree = 500, importance = TRUE)

#REGRESSION ANALYSIS FOR CONTINUOUS VARIABLE
# Predict on the held-out test set.
predictionsRQ3 = predict(rf.RQ3, newdata = test_data3)

# Variable importance: POVCAT23 ranks highest by %IncMSE (see importance() below).
varImpPlot(rf.RQ3)

rf.RQ3

Call:
 randomForest(formula = total_tests ~ ., data = train_data3, ntree = 500,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 1

          Mean of squared residuals: 1.922564
                    % Var explained: 0.42
plot(rf.RQ3)

importance(rf.RQ3)
            %IncMSE IncNodePurity
Insurance  8.256505     12.065347
SEX        4.307670      7.157186
POVCAT23  18.839742     31.904179
summary (rf.RQ3)
                Length Class  Mode     
call               5   -none- call     
type               1   -none- character
predicted       2970   -none- numeric  
mse              500   -none- numeric  
rsq              500   -none- numeric  
oob.times       2970   -none- numeric  
importance         6   -none- numeric  
importanceSD       3   -none- numeric  
localImportance    0   -none- NULL     
proximity          0   -none- NULL     
ntree              1   -none- numeric  
mtry               1   -none- numeric  
forest            11   -none- list     
coefs              0   -none- NULL     
y               2970   -none- numeric  
test               0   -none- NULL     
inbag              0   -none- NULL     
terms              3   terms  call     
# Evaluate the random forest on the test set.
# Regression metrics (the rendered output is authoritative; the inline
# values in the original comments came from a slightly different run).
sqrt(mean((test_data3$total_tests - predictionsRQ3)^2)) #RMSE (output: 1.372646)
[1] 1.372646
mean(abs(test_data3$total_tests - predictionsRQ3)) #MAE (output: 1.117072)
[1] 1.117072
cor(test_data3$total_tests , predictionsRQ3)^2 #test-set R^2 (output: 0.0105)
[1] 0.0104738