Post

당뇨 합병증 예측 알고리즘

당뇨 합병증 예측 알고리즘

필요한 라이브러리 로드

library(tidyverse)
library(caret)
library(randomForest)
library(smotefamily)  # 불균형 데이터 해결
library(fastDummies)  # One-hot encoding
library(pROC)  # ROC Curve
library(reshape2)
library(glmnet)  # Lasso 및 Ridge 회귀

set.seed(300)

✅ 1. 데이터 생성

data <- data.frame(
  Characteristic = c(
    rep("Gender", 2),
    rep("Age_group", 3),
    rep("Residential_area", 3),
    rep("Income", 3),
    rep("DM_duration", 3),
    rep("BMI_group", 4),
    rep("SBP_group", 2),
    rep("DBP_group", 2),
    rep("FBS_group", 2),
    rep("Cholesterol_group", 2),
    rep("Proteinuria", 2),
    rep("Smoking", 3),
    rep("Alcohol", 3),
    rep("Physical_activity", 5),
    rep("Medication", 2)
  ),
  
  Category = c(
    "Men", "Women",
    "30–49", "50–69", "70–89",
    "Capital", "Metropolitan", "Non-metropolitan",
    "1–3", "4–6", "7–10",
    "4~<7", "7~<10", "≥10",
    "<18.5", "18.5~22.9", "23.0~24.9", "≥25",
    "<130", "≥130",
    "<85", "≥85",
    "<100", "≥100",
    "<220", "≥220",
    "Negative", "Positive",
    "Never", "Quit", "Current",
    "<1/month", "1-4/month", "≥Twice/week",
    "None", "1-2", "3-4", "5-6", "Everyday",
    "<80", "≥80"
  ),
  
  Complication_No = c(
    180, 35,
    99, 108, 8,
    29, 77, 109,
    45, 38, 132,
    6, 76, 133,
    1, 63, 51, 100,
    95, 120,
    140, 75,
    39, 176,
    145, 70,
    204, 11,
    111, 24, 80,
    81, 99, 35,
    89, 70, 32, 6, 18,
    53, 162
  ),
  
  Complication_Yes = c(
    886, 306,
    365, 739, 88,
    204, 322, 666,
    271, 239, 682,
    26, 351, 815,
    10, 267, 319, 596,
    470, 722,
    721, 471,
    235, 957,
    800, 392,
    1125, 67,
    712, 124, 356,
    557, 483, 152,
    521, 353, 179, 47, 92,
    334, 858
  )
)

✅ 2. 데이터 변환 (long format)

data_long <- data %>%
  pivot_longer(
    cols = c(Complication_No, Complication_Yes),
    names_to = "Complication",
    values_to = "Count"
  ) %>%
  uncount(Count) %>%
  mutate(
    Complication = ifelse(Complication == "Complication_Yes", 1, 0),  # 1: Yes, 0: No
    Complication = as.factor(Complication),
    Characteristic = as.factor(Characteristic),
    Category = as.factor(Category)
  ) %>%
  select(-Characteristic)  # Characteristic 컬럼 제거

✅ 3. Train/Test 데이터 분할 (70:30)

set.seed(300)
train_index <- createDataPartition(data_long$Complication, p = 0.7, list = FALSE)
train_data <- data_long[train_index, ]
test_data <- data_long[-train_index, ]

✅ 4. One-hot Encoding (모든 범주형 변수 변환)

train_encoded <- dummy_cols(train_data, select_columns = "Category", remove_first_dummy = TRUE, remove_selected_columns = TRUE)
test_encoded <- dummy_cols(test_data, select_columns = "Category", remove_first_dummy = TRUE, remove_selected_columns = TRUE)

✅ 5. 변수명 변환 (특수 문자 처리)

fix_column_names <- function(df) {
  colnames(df) <- gsub("<", "Less_", colnames(df))
  colnames(df) <- gsub(">", "More_", colnames(df))
  colnames(df) <- gsub("~", "Range_", colnames(df))
  colnames(df) <- gsub("-", "_", colnames(df))
  colnames(df) <- make.names(colnames(df), unique = TRUE)
  return(df)
}

train_encoded <- fix_column_names(train_encoded)
test_encoded <- fix_column_names(test_encoded)

✅ 6. 데이터 스케일링 (SMOTE 이전 수행)

preProcess_model <- preProcess(train_encoded, method = c("center", "scale"))
train_scaled <- predict(preProcess_model, train_encoded)
test_scaled <- predict(preProcess_model, test_encoded)

✅ 7. SMOTE 적용 (스케일링된 훈련 데이터 사용)

smote_train <- SMOTE(train_scaled[,-1], train_scaled$Complication)
train_balanced <- smote_train$data %>%
  rename(Complication = class) %>%
  mutate(Complication = as.factor(Complication))

  test_balanced <- test_scaled

📌 SMOTE 적용 전 클래스 비율 확인 table(data_long$Complication) table(train_scaled$Complication) # 원본 데이터 비율

📌 SMOTE 적용 후 클래스 비율 확인 table(train_balanced$Complication) # SMOTE 적용 후 확인 table(test_balanced$Complication)

✅ 8. 로지스틱 회귀 모델 학습

logit_model <- glm(Complication ~ ., family = binomial, data = train_balanced)
summary(logit_model)

✅ 9. 랜덤 포레스트 모델 학습

rf_model <- randomForest(Complication ~ ., data = train_balanced, ntree = 500, importance = TRUE)
print(rf_model)

✅ 10. 모델 평가 함수

evaluate_model <- function(model, test_data) {
  # 예측값 계산 (로지스틱 회귀 vs 랜덤 포레스트)
  if ("glm" %in% class(model)) {
    pred_prob <- predict(model, newdata = test_data, type = "response")  # 확률 예측
    pred_class <- ifelse(pred_prob > 0.5, 1, 0)  # 0 또는 1로 변환
  } else if ("randomForest" %in% class(model)) {
    pred_class <- predict(model, newdata = test_data, type = "class")  # 이미 0 또는 1 형태
  }
  
  # factor 형식으로 변환 (클래스 일치)
  pred_class <- as.factor(pred_class)
  actual_class <- as.factor(test_data$Complication)
  
  # Confusion Matrix 계산
  cm <- confusionMatrix(pred_class, actual_class)
  print(cm)
}

✅ 11. 모델 평가 실행

evaluate_model(logit_model, test_scaled)
evaluate_model(rf_model, test_scaled)

✅ 12. ROC Curve 및 AUC 계산

pred_prob <- predict(logit_model, newdata = test_scaled, type = "response")
roc_curve <- roc(test_scaled$Complication, pred_prob)
plot(roc_curve, col = "blue", main = "ROC Curve (Logistic Regression)")
print(auc(roc_curve))

rf_pred_prob <- predict(rf_model, newdata = test_scaled, type = "prob")[,2]
rf_roc_curve <- roc(test_scaled$Complication, rf_pred_prob)
plot(rf_roc_curve, col = "red", main = "ROC Curve (Random Forest)")
print(auc(rf_roc_curve))

✅ 13. 랜덤 포레스트 변수 중요도 확인

importance_vals <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("Feature") %>%
  arrange(desc(MeanDecreaseGini))

# 상위 20개 변수 시각화
ggplot(head(importance_vals, 20), aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
  geom_col(fill = "steelblue", width = 0.7) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 20 Important Features (Random Forest)",
       x = "Feature",
       y = "Mean Decrease in Gini")

# 중요한 변수만 남기기

logit_summary <- summary(logit_model)
p_values <- logit_summary$coefficients[, 4]
# p-value가 0.05 미만인 변수 선택
significant_vars <- names(which(logit_summary$coefficients[, 4] < 0.05))

# Intercept(절편) 제거
significant_vars <- significant_vars[!significant_vars %in% "(Intercept)"]

# 선택된 변수 출력
print(significant_vars)


rf_importance <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("Feature") %>%
  arrange(desc(MeanDecreaseGini))  # 중요도 높은 순 정렬

# 상위 10개 변수 확인
top_rf_vars <- head(rf_importance$Feature, 10)
print(top_rf_vars)

# 로지스틱 회귀 & 랜덤 포레스트 공통 변수 찾기
selected_vars <- intersect(significant_vars, top_rf_vars)

print("최종 선택된 변수 목록:")
print(selected_vars)

# 선택된 변수만 포함한 데이터셋 생성
train_selected <- train_balanced %>% select(all_of(c("Complication", selected_vars)))
test_selected <- test_scaled %>% select(all_of(c("Complication", selected_vars)))

# 로지스틱 회귀 모델 재학습
logit_model_selected <- glm(Complication ~ ., family = binomial, data = train_selected)
summary(logit_model_selected)

# 랜덤 포레스트 모델 재학습
rf_model_selected <- randomForest(Complication ~ ., 
                                  data = train_selected, 
                                  ntree = 500, 
                                  importance = TRUE)


evaluate_model(logit_model_selected, test_scaled)
evaluate_model(rf_model_selected, test_scaled)

# 로지스틱 회귀 ROC Curve
pred_prob_selected <- predict(logit_model_selected, newdata = test_selected, type = "response")
roc_curve_selected <- roc(test_selected$Complication, pred_prob_selected)
plot(roc_curve_selected, col = "blue", main = "ROC Curve (Selected Features - Logistic Regression)")
print(auc(roc_curve_selected))

# 랜덤 포레스트 ROC Curve
rf_pred_prob_selected <- predict(rf_model_selected, newdata = test_selected, type = "prob")[,2]
rf_roc_curve_selected <- roc(test_selected$Complication, rf_pred_prob_selected)
plot(rf_roc_curve_selected, col = "red", main = "ROC Curve (Selected Features - Random Forest)")
print(auc(rf_roc_curve_selected))

✅ 14. Lasso 및 Ridge 회귀 적용

x <- model.matrix(Complication ~ ., train_balanced)[,-1]
y <- as.numeric(train_balanced$Complication) - 1  # 0과 1 변환

# Lasso (L1 정규화)
cv_lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")  # alpha=1 → Lasso
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_lasso$lambda.min)

# Ridge (L2 정규화)
cv_ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial")  # alpha=0 → Ridge
ridge_model <- glmnet(x, y, alpha = 0, lambda = cv_ridge$lambda.min)

print(cv_lasso$lambda.min)  # 최적의 라쏘 정규화 파라미터
print(cv_ridge$lambda.min)  # 최적의 릿지 정규화 파라미터


#  Lasso 모델 학습 및 평가
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_lasso$lambda.min)

# Lasso 모델 예측
lasso_pred_prob <- predict(lasso_model, newx = model.matrix(Complication ~ ., test_scaled)[,-1], type = "response")
lasso_pred_class <- ifelse(lasso_pred_prob > 0.5, 1, 0)

# Lasso 모델 평가
lasso_cm <- confusionMatrix(as.factor(lasso_pred_class), as.factor(test_scaled$Complication))
print(lasso_cm)

# Lasso ROC Curve 및 AUC
lasso_roc <- roc(test_scaled$Complication, lasso_pred_prob)
plot(lasso_roc, col = "blue", main = "ROC Curve (Lasso)")
print(auc(lasso_roc))

✅ 2Ridge 모델 학습 및 평가

ridge_model <- glmnet(x, y, alpha = 0, lambda = cv_ridge$lambda.min)

# Ridge 모델 예측
ridge_pred_prob <- predict(ridge_model, newx = model.matrix(Complication ~ ., test_scaled)[,-1], type = "response")
ridge_pred_class <- ifelse(ridge_pred_prob > 0.5, 1, 0)

# Ridge 모델 평가
ridge_cm <- confusionMatrix(as.factor(ridge_pred_class), as.factor(test_scaled$Complication))
print(ridge_cm)

# Ridge ROC Curve 및 AUC
ridge_roc <- roc(test_scaled$Complication, ridge_pred_prob)
plot(ridge_roc, col = "red", main = "ROC Curve (Ridge)")
print(auc(ridge_roc))

✅ 15. 의사결정나무 모델 추가

library(rpart)
library(rpart.plot)

# 기본 의사결정나무
tree_model <- rpart(Complication ~ ., 
                    data = train_balanced,
                    method = "class",
                    control = rpart.control(cp = 0.001, minsplit = 20))

# 가지치기 수행 (최적 cp 값 찾기)
plotcp(tree_model)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]),"CP"]
pruned_tree <- prune(tree_model, cp = best_cp)

# 모델 시각화
rpart.plot(pruned_tree, 
           extra = 104, 
           box.palette = "GnBu",
           shadow.col = "gray",
           nn = TRUE)

# 성능 평가
tree_pred <- predict(pruned_tree, test_scaled, type = "class")
tree_prob <- predict(pruned_tree, test_scaled, type = "prob")[,2]

# 혼동 행렬
print(confusionMatrix(tree_pred, test_scaled$Complication))

# ROC 곡선
tree_roc <- roc(test_scaled$Complication, tree_prob)
plot(tree_roc, col="green", main="ROC Curve (Decision Tree)")
auc(tree_roc)

✅ 16. 다중공선성

library(car)

# 다중공선성 확인 (VIF 계산)
logit_model <- glm(Complication ~ ., family = binomial, data = train_balanced)
vif_values <- car::vif(logit_model)
print(vif_values)

# VIF 시각화
barplot(vif_values, col = "steelblue", horiz = TRUE, las = 1)
abline(v = 2, col = "red", lty = 2)

# VIF > 2 인 변수 제거 (보통 5~10인데 차이가 없어서)
high_vif <- names(which(vif_values > 2))
train_reduced <- train_balanced %>% select(-all_of(high_vif))
test_reduced <- test_balanced %>% select(-all_of(high_vif))

# 데이터 정규화 (표준화)
preProcess_model <- preProcess(train_reduced, method = c("center", "scale"))
train_normalized <- predict(preProcess_model, train_reduced)
test_normalized <- predict(preProcess_model, test_reduced)

# 모델 재학습
logit_reduced <- glm(Complication ~ ., family = binomial, data = train_normalized)
evaluate_model(logit_reduced, test_balanced)


# ✅ 회귀 모델 평가 (AUC 포함)
pred_prob <- predict(logit_reduced, newdata = test_balanced, type = "response")

# ✅ ROC Curve 및 AUC 계산
roc_curve <- roc(test_balanced$Complication, pred_prob)
auc_value <- auc(roc_curve)

# ✅ 결과 출력
print(auc_value)  # AUC 값 출력

결론

: 현재 모델들은 실제 임상적 의미 있는 예측 능력이 없으며, 데이터의 근본적 한계(예측변수-결과 간 연관성 부족)가 주된 원인입니다. 의사결정나무 시각화 결과에서도 복잡한 규칙 없이 무작위 분할이 관찰됩니다. 해결을 위해서는 더 의미 있는 예측변수 수집과 전문가 피드백이 필수적입니다.

This post is licensed under CC BY 4.0 by the author.