PDAC Prediction with a Urine Biomarker Penal
Yujie (Janet) He
Yujie (Janet) He
SUPPLEMENTARY MATERIALS
The prior and full-conditional distribution are not conjugated. Therefore, we utilized the Metropolis–Hasting algorithm to update the parameters β, which involves the following steps:
bern.logit.reg.mcmc <- function(y,X,betamn,betavar,beta.tune,n.mcmc){
# Libraries and Subroutines
logit <- function(theta){
log(theta/(1-theta))
}
logit.inv <- function(z){
exp(z)/(1+exp(z))
}
# Preliminary Variables
n.burn = round(n.mcmc/10)
X = as.matrix(X)
y = as.vector(y)
n = length(y)
p = dim(X)[2]
betasave=matrix(0,p,n.mcmc)
Davgsave=rep(0,n.mcmc)
# Starting Values
beta=betamn
theta=logit.inv(X%*%beta)
accept=0
Sig2 = betavar*betavar*diag(7)
# Gibbs Loop
for(k in 2:n.mcmc){
if(k%%1000==0) cat(k," ")
# Sample beta
library(mvtnorm)
beta.star=rnorm(p,beta,beta.tune)
theta.star=logit.inv(X%*%beta.star)
mh1=sum(dbinom(y,1,theta.star,log=TRUE)) + dmvnorm(beta.star, betamn, Sig2, log = TRUE)
mh2=sum(dbinom(y,1,theta,log=TRUE))+dmvnorm(beta, betamn, Sig2, log = TRUE)
mh=exp(mh1-mh2)
if(mh > runif(1)){
beta=beta.star
theta=theta.star
accept = accept +1
}
Davgsave[k]=-2*(sum(dbinom(y,1,theta,log=TRUE)))
### Save Samples
betasave[,k]=beta
}
cat("\n")
# Calculate DIC and AIC
postbetamn=apply(betasave[,-(1:n.burn)],1,mean)
print("Posterior Mean for Beta:")
print(postbetamn)
Dhat=-2*(sum(dbinom(y,1,logit.inv(X%*%postbetamn),log=TRUE)))
Davg=mean(Davgsave[-(1:n.burn)])
pD=Davg-Dhat
DIC=2*Davg-Dhat
AIC=Davg+2*ncol(X)
cat("Dhat:",Dhat,"Davg:",Davg,"pD:",pD,"DIC:",DIC,"\n", 'Acceptance:',accept/n.mcmc, '\n')
# Write output
list(y=y,X=X,n.mcmc=n.mcmc,betasave=betasave,postbetamn=postbetamn,acceptance = accept/n.mcmc, DIC = DIC, AIC = AIC)
}
library(dplyr)
library(caret)
library(ggplot2)
library(GGally)
library(sjPlot)
library(interplot)
library(pROC)
library(scales)
set.seed(2788)
##### Logit ######
# Import dataset
data <- read.csv("pancreatic.cancer.csv", header = TRUE)
data <- filter(data, sample_origin != 'UCL')
# Binarize diagnosis and sex
data$sex <- ifelse(data$sex == 'F', 1, 0)
data$diagnosis <- ifelse(data$diagnosis == '3', 1, 0)
data_cancer = filter(data, diagnosis == 1)
data_other = filter(data, diagnosis == 0)
# Turn the control variables into factors
data$patient_cohort <- as.factor(data$patient_cohort)
data$sample_origin <- as.factor(data$sample_origin)
# Split
cancer_ind <- sample(seq_len(nrow(data_cancer)), size = floor(0.9*nrow(data_cancer)))
other_ind <- sample(seq_len(nrow(data_other)), size = floor(0.9*nrow(data_other)))
data_anal <- rbind(data_cancer[cancer_ind,], data_other[other_ind,])
data_test <- rbind(data_cancer[-cancer_ind,], data_other[-other_ind,])
# Create Variables and Scale Covariates. check the correlations.
ggpairs(as.data.frame(as.matrix(scale(data_anal[,10:13]))))
# Logit with interaction of all the variables with a corr > 0.5 or < -0.5
logit_interaction <- glm(diagnosis ~ patient_cohort + sample_origin +
age + sex + creatinine + LYVE1 + REG1B + TFF1 + TFF1*REG1B,
data = data_anal, family = "binomial"(link="logit"))
summary(logit_interaction)
# Conditional interaction effect plots
plot1 <- interplot(logit_interaction, var1 = "TFF1", var2 = "REG1B") +
xlab("REG1B") +
ylab("TFF1") +
ggtitle("The effect of TFF1 level on pancreatic cancer
probability conditional on REG1B level")+
theme(plot.title = element_text(size = 10, hjust = 0.5))
plot2 <- interplot(logit_interaction, var1 = "REG1B", var2 = "TFF1") +
xlab("TFF1") +
ylab("REG1B") +
ggtitle("The effect of REG1B level on pancreatic cancer
probability conditional on TFF1 level")+
theme(plot.title = element_text(size = 10,hjust = 0.5))
grid.arrange(plot1, plot2, ncol = 2)
# Marginal effects
plot_model(logit_interaction, type = "pred", terms=c("REG1B", "sample_origin[BPTB, ESP, LIV]"))
plot_model(logit_interaction, type = "pred", terms=c("TFF1", "sample_origin[BPTB, ESP, LIV]"))
plot_model(logit_interaction, type = "pred", terms=c("creatinine", "sample_origin[BPTB, ESP, LIV]"))
plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "sample_origin[BPTB, ESP, LIV]"))
plot_model(logit_interaction, type = "pred", terms=c("REG1B", "patient_cohort[Cohort1, Cohort2]"))
plot_model(logit_interaction, type = "pred", terms=c("TFF1", "patient_cohort[Cohort1, Cohort2]"))
plot_model(logit_interaction, type = "pred", terms=c("creatinine", "patient_cohort[Cohort1, Cohort2]"))
plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "patient_cohort[Cohort1, Cohort2]"))
plot_model(logit_interaction, type = "pred", terms=c("REG1B", "sex[0, 1]"))
plot_model(logit_interaction, type = "pred", terms=c("TFF1", "sex[0,1]"))
plot_model(logit_interaction, type = "pred", terms=c("creatinine", "sex[0,1]"))
plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "sex[0,1]"))
##### Bayesian #####
# Create Variables and Scale Covariates
y = data_anal$diagnosis
n = length(y)
X = cbind(rep(1,n), scale(data_anal$age), data_anal$sex, as.matrix((scale(data_anal[,10:13]))))
# Fit the Model
source("bern.logit.reg.mcmc.wo.corr.R")
out=bern.logit.reg.mcmc(y,X,rep(0,dim(X)[2]),1.5,.1,100000)
par(mfrow = c(1,3))
matplot(t(out$betasave[1:2,]),type="l",lty=1,col = alpha(1:2,.2),ylab = 'beta')
legend('topleft',c('beta_0','beta_1'),
col = 1:2, fill = 1:2,cex=.5)
matplot(t(out$betasave[3:4,]),type="l",lty=1,col = alpha(3:4,.2),ylab='beta')
legend('topleft',c('beta_2','beta_3'),
col = 3:4, fill = 3:4,cex=.5)
matplot(t(out$betasave[5:7,]),type="l",lty=1,col = alpha(5:7,.2),ylab='beta')
legend('topleft',c('beta_4','beta_5','beta_6'),
col = 5:7, fill = 5:7,cex=.5)
# Prediction
# Scale the test dataset
X.mean = c(mean(data_anal$age), mean(data_anal$sex), mean(data_anal[,10]),
mean(data_anal[,11]),mean(data_anal[,12]),mean(data_anal[,13]))
X.sd = c(sd(data_anal$age), sd(data_anal$sex), sd(data_anal[,10]),
sd(data_anal[,11]),sd(data_anal[,12]),sd(data_anal[,13]))
y.test = data_test$diagnosis
X.test = cbind(data_test$age, data_test$sex, as.matrix(data_test[10:13]))
X.test = (X.test - t(replicate(length(y.test),X.mean)))/t(replicate(length(y.test),X.sd))
X.test = cbind(rep(1,length(y.test)),X.test)
# Predict y
logit.inv <- function(z){
exp(z)/(1+exp(z))
}
theta.pred = logit.inv(as.matrix(X.test)%*%out$postbetamn)
y.pred = c()
for (theta in theta.pred){
y.pred = cbind(y.pred, rbinom(1,1,theta))
}
# Plot
layout(1)
plot(1:length(y.test),y.pred,col='red', main = 'Prediction accuracy', ylab = 'Benign vs. Cancer',xlab = 'Patient #')
points(1:length(y.test),y.test, pch = 4, col = 'blue')
legend('center', legend=c("Prediction", "Diagnosis"),
col=c("red", "blue"), pch = c(1,4),cex=0.8)
# Analysis
hit = 0
for (i in 1: length(y.test)){
if(y.test[i]==y.pred[i]){
hit = hit+1
}
}
cat("Accuracy:", hit/length(y.test))
# Test the model with the data_test: Bayesian
# ROC
bayesian.pred <- mapply(t(theta.pred), FUN=as.numeric)
roc_curve <- roc(data_test$diagnosis, bayesian.pred)
plot5 <- ggroc(roc_curve,color ='blue', size = 3) +
ggtitle(paste0('ROC Curve_Bayesian', '(AUC = ', round(auc(data_test$diagnosis, bayesian.pred),4), ')'))
# Confusion matrix
confusion_matrix_Bayesian <- table(Predicted = y.pred, Actual = y.test)
confusion_Bayesian_df <- as.data.frame(confusion_matrix_Bayesian)
plot3 <- ggplot(confusion_Bayesian_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "black", size = 5) +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Confusion Matrix of Bayesian estimation", x = "Actual", y = "Predicted", fill = 'Count') +
theme_minimal()
########
# Test the model with the data_test: MLE
predicted_probs <- predict(logit_interaction, newdata = data_test, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)
confusion_matrix <- table(Predicted = predicted_classes, Actual = data_test$diagnosis)
# ROC
roc_curve <- roc(data_test$diagnosis, predicted_probs)
plot6<-ggroc(roc_curve,color ='red', size = 3) +
ggtitle(paste0('ROC Curve_MLE', '(AUC = ', round(auc(data_test$diagnosis, predicted_probs),4), ')'))
# Confusion matrix
confusion_df <- as.data.frame(confusion_matrix)
plot4 <-ggplot(confusion_df, aes(x = Actual, y = Predicted, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "black", size = 5) +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Confusion Matrix of MLE estimation", x = "Actual", y = "Predicted", fill = 'Count') +
theme_minimal()
grid.arrange(plot4, plot3, ncol=2)
grid.arrange(plot6, plot5, ncol=2)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import pandas as pd
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from tabpfn import TabPFNClassifier
data = pd.read_csv("MLE_finalproj/pancreatic.cancer.csv")
# Preprocess
data = data.drop(columns = ['sample_id','stage','benign_sample_diagnosis','plasma_CA19_9','REG1A'])
data = data[data['sample_origin']!='UCL']
data['sex'] = data['sex'].apply(lambda x: 0 if x == 'M' else 1)
data['patient_cohort'] = data['patient_cohort'].apply(lambda x: 1 if x == 'Cohort1' else 2)
data['sample_origin'] = data['sample_origin'].map({'BPTB':0, 'LIV':1, 'ESP': 2})
data['diagnosis'] = data['diagnosis'].apply(lambda x: 0 if x < 3 else 1)
data
# Split the data
x = data.drop("diagnosis", axis=1)
y = data["diagnosis"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=58/570, random_state=200)
# XGBoost
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss') # 'use_label_encoder=False' suppresses a warning
model_xgb.fit(x_train, y_train)
y_pred_xgb = model.predict(x_test)
# Confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_xgb))
disp.plot(cmap=plt.cm.RdPu)
plt.title("Confusion Matrix of XGBoost")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('cm_xgb.png')
# lgbm
model_lgbm = LGBMClassifier()
model_lgbm.fit(x_train, y_train)
y_pred_lgbm = model_lgbm.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgbm))
print("Classification Report:\n", classification_report(y_test, y_pred_lgbm))
# Confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_lgbm))
disp.plot(cmap=plt.cm.RdPu)
plt.title("Confusion Matrix of LightGBM")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('cm_lgbm.png')
# tabpfn
model_tabpfn = TabPFNClassifier()
model_tabpfn.fit(x_train, y_train)
y_pred_tabpfn = model_tabpfn.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_tabpfn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tabpfn))
print("Classification Report:\n", classification_report(y_test, y_pred_tabpfn))
# Confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_tabpfn))
disp.plot(cmap=plt.cm.RdPu)
plt.title("Confusion Matrix of TabPFN")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('cm_tabpfn.png')
# roc
models = {
"XGBoost": model_xgb,
"TabPFN": model_tabpfn,
"LightGBM": model_lgbm
}
plt.figure(figsize=(6, 6))
for name, model in models.items():
y_probs = model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr,linewidth = 3, label=f'{name} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', label='Chance',linewidth = 3)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.grid()
plt.savefig('roc.png')