당뇨 합병증 예측 알고리즘
당뇨 합병증 예측 알고리즘
필요한 라이브러리 로드
library(tidyverse)
library(caret)
library(randomForest)
library(smotefamily) # 불균형 데이터 해결
library(fastDummies) # One-hot encoding
library(pROC) # ROC Curve
library(reshape2)
library(glmnet) # Lasso 및 Ridge 회귀
set.seed(300)
✅ 1. 데이터 생성
data <- data.frame(
Characteristic = c(
rep("Gender", 2),
rep("Age_group", 3),
rep("Residential_area", 3),
rep("Income", 3),
rep("DM_duration", 3),
rep("BMI_group", 4),
rep("SBP_group", 2),
rep("DBP_group", 2),
rep("FBS_group", 2),
rep("Cholesterol_group", 2),
rep("Proteinuria", 2),
rep("Smoking", 3),
rep("Alcohol", 3),
rep("Physical_activity", 5),
rep("Medication", 2)
),
Category = c(
"Men", "Women",
"30–49", "50–69", "70–89",
"Capital", "Metropolitan", "Non-metropolitan",
"1–3", "4–6", "7–10",
"4~<7", "7~<10", "≥10",
"<18.5", "18.5~22.9", "23.0~24.9", "≥25",
"<130", "≥130",
"<85", "≥85",
"<100", "≥100",
"<220", "≥220",
"Negative", "Positive",
"Never", "Quit", "Current",
"<1/month", "1-4/month", "≥Twice/week",
"None", "1-2", "3-4", "5-6", "Everyday",
"<80", "≥80"
),
Complication_No = c(
180, 35,
99, 108, 8,
29, 77, 109,
45, 38, 132,
6, 76, 133,
1, 63, 51, 100,
95, 120,
140, 75,
39, 176,
145, 70,
204, 11,
111, 24, 80,
81, 99, 35,
89, 70, 32, 6, 18,
53, 162
),
Complication_Yes = c(
886, 306,
365, 739, 88,
204, 322, 666,
271, 239, 682,
26, 351, 815,
10, 267, 319, 596,
470, 722,
721, 471,
235, 957,
800, 392,
1125, 67,
712, 124, 356,
557, 483, 152,
521, 353, 179, 47, 92,
334, 858
)
)
✅ 2. 데이터 변환 (long format)
data_long <- data %>%
pivot_longer(
cols = c(Complication_No, Complication_Yes),
names_to = "Complication",
values_to = "Count"
) %>%
uncount(Count) %>%
mutate(
Complication = ifelse(Complication == "Complication_Yes", 1, 0), # 1: Yes, 0: No
Complication = as.factor(Complication),
Characteristic = as.factor(Characteristic),
Category = as.factor(Category)
) %>%
select(-Characteristic) # Characteristic 컬럼 제거
✅ 3. Train/Test 데이터 분할 (70:30)
set.seed(300)
train_index <- createDataPartition(data_long$Complication, p = 0.7, list = FALSE)
train_data <- data_long[train_index, ]
test_data <- data_long[-train_index, ]
✅ 4. One-hot Encoding (모든 범주형 변수 변환)
train_encoded <- dummy_cols(train_data, select_columns = "Category", remove_first_dummy = TRUE, remove_selected_columns = TRUE)
test_encoded <- dummy_cols(test_data, select_columns = "Category", remove_first_dummy = TRUE, remove_selected_columns = TRUE)
✅ 5. 변수명 변환 (특수 문자 처리)
fix_column_names <- function(df) {
colnames(df) <- gsub("<", "Less_", colnames(df))
colnames(df) <- gsub(">", "More_", colnames(df))
colnames(df) <- gsub("~", "Range_", colnames(df))
colnames(df) <- gsub("-", "_", colnames(df))
colnames(df) <- make.names(colnames(df), unique = TRUE)
return(df)
}
train_encoded <- fix_column_names(train_encoded)
test_encoded <- fix_column_names(test_encoded)
✅ 6. 데이터 스케일링 (SMOTE 이전 수행)
preProcess_model <- preProcess(train_encoded, method = c("center", "scale"))
train_scaled <- predict(preProcess_model, train_encoded)
test_scaled <- predict(preProcess_model, test_encoded)
✅ 7. SMOTE 적용 (스케일링된 훈련 데이터 사용)
smote_train <- SMOTE(train_scaled[,-1], train_scaled$Complication)
train_balanced <- smote_train$data %>%
rename(Complication = class) %>%
mutate(Complication = as.factor(Complication))
test_balanced <- test_scaled
📌 SMOTE 적용 전 클래스 비율 확인 table(data_long$Complication) table(train_scaled$Complication) # 원본 데이터 비율
📌 SMOTE 적용 후 클래스 비율 확인 table(train_balanced$Complication) # SMOTE 적용 후 확인 table(test_balanced$Complication)
✅ 8. 로지스틱 회귀 모델 학습
logit_model <- glm(Complication ~ ., family = binomial, data = train_balanced)
summary(logit_model)
✅ 9. 랜덤 포레스트 모델 학습
rf_model <- randomForest(Complication ~ ., data = train_balanced, ntree = 500, importance = TRUE)
print(rf_model)
✅ 10. 모델 평가 함수
evaluate_model <- function(model, test_data) {
# 예측값 계산 (로지스틱 회귀 vs 랜덤 포레스트)
if ("glm" %in% class(model)) {
pred_prob <- predict(model, newdata = test_data, type = "response") # 확률 예측
pred_class <- ifelse(pred_prob > 0.5, 1, 0) # 0 또는 1로 변환
} else if ("randomForest" %in% class(model)) {
pred_class <- predict(model, newdata = test_data, type = "class") # 이미 0 또는 1 형태
}
# factor 형식으로 변환 (클래스 일치)
pred_class <- as.factor(pred_class)
actual_class <- as.factor(test_data$Complication)
# Confusion Matrix 계산
cm <- confusionMatrix(pred_class, actual_class)
print(cm)
}
✅ 11. 모델 평가 실행
evaluate_model(logit_model, test_scaled)
evaluate_model(rf_model, test_scaled)
✅ 12. ROC Curve 및 AUC 계산
pred_prob <- predict(logit_model, newdata = test_scaled, type = "response")
roc_curve <- roc(test_scaled$Complication, pred_prob)
plot(roc_curve, col = "blue", main = "ROC Curve (Logistic Regression)")
print(auc(roc_curve))
rf_pred_prob <- predict(rf_model, newdata = test_scaled, type = "prob")[,2]
rf_roc_curve <- roc(test_scaled$Complication, rf_pred_prob)
plot(rf_roc_curve, col = "red", main = "ROC Curve (Random Forest)")
print(auc(rf_roc_curve))
✅ 13. 랜덤 포레스트 변수 중요도 확인
importance_vals <- importance(rf_model) %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
arrange(desc(MeanDecreaseGini))
# 상위 20개 변수 시각화
ggplot(head(importance_vals, 20), aes(x = reorder(Feature, MeanDecreaseGini), y = MeanDecreaseGini)) +
geom_col(fill = "steelblue", width = 0.7) +
coord_flip() +
theme_minimal() +
labs(title = "Top 20 Important Features (Random Forest)",
x = "Feature",
y = "Mean Decrease in Gini")
# 중요한 변수만 남기기
logit_summary <- summary(logit_model)
p_values <- logit_summary$coefficients[, 4]
# p-value가 0.05 미만인 변수 선택
significant_vars <- names(which(logit_summary$coefficients[, 4] < 0.05))
# Intercept(절편) 제거
significant_vars <- significant_vars[!significant_vars %in% "(Intercept)"]
# 선택된 변수 출력
print(significant_vars)
rf_importance <- importance(rf_model) %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
arrange(desc(MeanDecreaseGini)) # 중요도 높은 순 정렬
# 상위 10개 변수 확인
top_rf_vars <- head(rf_importance$Feature, 10)
print(top_rf_vars)
# 로지스틱 회귀 & 랜덤 포레스트 공통 변수 찾기
selected_vars <- intersect(significant_vars, top_rf_vars)
print("최종 선택된 변수 목록:")
print(selected_vars)
# 선택된 변수만 포함한 데이터셋 생성
train_selected <- train_balanced %>% select(all_of(c("Complication", selected_vars)))
test_selected <- test_scaled %>% select(all_of(c("Complication", selected_vars)))
# 로지스틱 회귀 모델 재학습
logit_model_selected <- glm(Complication ~ ., family = binomial, data = train_selected)
summary(logit_model_selected)
# 랜덤 포레스트 모델 재학습
rf_model_selected <- randomForest(Complication ~ .,
data = train_selected,
ntree = 500,
importance = TRUE)
evaluate_model(logit_model_selected, test_scaled)
evaluate_model(rf_model_selected, test_scaled)
# 로지스틱 회귀 ROC Curve
pred_prob_selected <- predict(logit_model_selected, newdata = test_selected, type = "response")
roc_curve_selected <- roc(test_selected$Complication, pred_prob_selected)
plot(roc_curve_selected, col = "blue", main = "ROC Curve (Selected Features - Logistic Regression)")
print(auc(roc_curve_selected))
# 랜덤 포레스트 ROC Curve
rf_pred_prob_selected <- predict(rf_model_selected, newdata = test_selected, type = "prob")[,2]
rf_roc_curve_selected <- roc(test_selected$Complication, rf_pred_prob_selected)
plot(rf_roc_curve_selected, col = "red", main = "ROC Curve (Selected Features - Random Forest)")
print(auc(rf_roc_curve_selected))
✅ 14. Lasso 및 Ridge 회귀 적용
x <- model.matrix(Complication ~ ., train_balanced)[,-1]
y <- as.numeric(train_balanced$Complication) - 1 # 0과 1 변환
# Lasso (L1 정규화)
cv_lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial") # alpha=1 → Lasso
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_lasso$lambda.min)
# Ridge (L2 정규화)
cv_ridge <- cv.glmnet(x, y, alpha = 0, family = "binomial") # alpha=0 → Ridge
ridge_model <- glmnet(x, y, alpha = 0, lambda = cv_ridge$lambda.min)
print(cv_lasso$lambda.min) # 최적의 라쏘 정규화 파라미터
print(cv_ridge$lambda.min) # 최적의 릿지 정규화 파라미터
# Lasso 모델 학습 및 평가
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_lasso$lambda.min)
# Lasso 모델 예측
lasso_pred_prob <- predict(lasso_model, newx = model.matrix(Complication ~ ., test_scaled)[,-1], type = "response")
lasso_pred_class <- ifelse(lasso_pred_prob > 0.5, 1, 0)
# Lasso 모델 평가
lasso_cm <- confusionMatrix(as.factor(lasso_pred_class), as.factor(test_scaled$Complication))
print(lasso_cm)
# Lasso ROC Curve 및 AUC
lasso_roc <- roc(test_scaled$Complication, lasso_pred_prob)
plot(lasso_roc, col = "blue", main = "ROC Curve (Lasso)")
print(auc(lasso_roc))
✅ 2Ridge 모델 학습 및 평가
ridge_model <- glmnet(x, y, alpha = 0, lambda = cv_ridge$lambda.min)
# Ridge 모델 예측
ridge_pred_prob <- predict(ridge_model, newx = model.matrix(Complication ~ ., test_scaled)[,-1], type = "response")
ridge_pred_class <- ifelse(ridge_pred_prob > 0.5, 1, 0)
# Ridge 모델 평가
ridge_cm <- confusionMatrix(as.factor(ridge_pred_class), as.factor(test_scaled$Complication))
print(ridge_cm)
# Ridge ROC Curve 및 AUC
ridge_roc <- roc(test_scaled$Complication, ridge_pred_prob)
plot(ridge_roc, col = "red", main = "ROC Curve (Ridge)")
print(auc(ridge_roc))
✅ 15. 의사결정나무 모델 추가
library(rpart)
library(rpart.plot)
# 기본 의사결정나무
tree_model <- rpart(Complication ~ .,
data = train_balanced,
method = "class",
control = rpart.control(cp = 0.001, minsplit = 20))
# 가지치기 수행 (최적 cp 값 찾기)
plotcp(tree_model)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]),"CP"]
pruned_tree <- prune(tree_model, cp = best_cp)
# 모델 시각화
rpart.plot(pruned_tree,
extra = 104,
box.palette = "GnBu",
shadow.col = "gray",
nn = TRUE)
# 성능 평가
tree_pred <- predict(pruned_tree, test_scaled, type = "class")
tree_prob <- predict(pruned_tree, test_scaled, type = "prob")[,2]
# 혼동 행렬
print(confusionMatrix(tree_pred, test_scaled$Complication))
# ROC 곡선
tree_roc <- roc(test_scaled$Complication, tree_prob)
plot(tree_roc, col="green", main="ROC Curve (Decision Tree)")
auc(tree_roc)
✅ 16. 다중공선성
library(car)
# 다중공선성 확인 (VIF 계산)
logit_model <- glm(Complication ~ ., family = binomial, data = train_balanced)
vif_values <- car::vif(logit_model)
print(vif_values)
# VIF 시각화
barplot(vif_values, col = "steelblue", horiz = TRUE, las = 1)
abline(v = 2, col = "red", lty = 2)
# VIF > 2 인 변수 제거 (보통 5~10인데 차이가 없어서)
high_vif <- names(which(vif_values > 2))
train_reduced <- train_balanced %>% select(-all_of(high_vif))
test_reduced <- test_balanced %>% select(-all_of(high_vif))
# 데이터 정규화 (표준화)
preProcess_model <- preProcess(train_reduced, method = c("center", "scale"))
train_normalized <- predict(preProcess_model, train_reduced)
test_normalized <- predict(preProcess_model, test_reduced)
# 모델 재학습
logit_reduced <- glm(Complication ~ ., family = binomial, data = train_normalized)
evaluate_model(logit_reduced, test_balanced)
# ✅ 회귀 모델 평가 (AUC 포함)
pred_prob <- predict(logit_reduced, newdata = test_balanced, type = "response")
# ✅ ROC Curve 및 AUC 계산
roc_curve <- roc(test_balanced$Complication, pred_prob)
auc_value <- auc(roc_curve)
# ✅ 결과 출력
print(auc_value) # AUC 값 출력
결론
: 현재 모델들은 실제 임상적 의미 있는 예측 능력이 없으며, 데이터의 근본적 한계(예측변수-결과 간 연관성 부족)가 주된 원인입니다. 의사결정나무 시각화 결과에서도 복잡한 규칙 없이 무작위 분할이 관찰됩니다. 해결을 위해서는 더 의미 있는 예측변수 수집과 전문가 피드백이 필수적입니다.
This post is licensed under CC BY 4.0 by the author.