PDAC Prediction with a Urine Biomarker Penal

Yujie (Janet) He

SUPPLEMENTARY MATERIALS

MCMC Algorithm

Method

Posterior and full conditional distribution

MCMC algorithm for logistic regression

R Script

R Script for Logit Model Fitting

Python script for ML models training

MCMC Algorithm

Method

Posterior and full conditional distribution

MCMC algorithm for logistic regression

The prior and full-conditional distribution are not conjugated. Therefore, we utilized the Metropolis–Hasting algorithm to update the parameters β, which involves the following steps:

R Script

bern.logit.reg.mcmc <- function(y,X,betamn,betavar,beta.tune,n.mcmc){

# Libraries and Subroutines

logit <- function(theta){

log(theta/(1-theta))

}

logit.inv <- function(z){

exp(z)/(1+exp(z))

}

# Preliminary Variables

n.burn = round(n.mcmc/10)

X = as.matrix(X)

y = as.vector(y)

n = length(y)

p = dim(X)[2]

betasave=matrix(0,p,n.mcmc)

Davgsave=rep(0,n.mcmc)

# Starting Values

beta=betamn

theta=logit.inv(X%*%beta)

accept=0

Sig2 = betavar*betavar*diag(7)

# Gibbs Loop

for(k in 2:n.mcmc){

if(k%%1000==0) cat(k," ")

# Sample beta

library(mvtnorm)

beta.star=rnorm(p,beta,beta.tune)

theta.star=logit.inv(X%*%beta.star)

mh1=sum(dbinom(y,1,theta.star,log=TRUE)) + dmvnorm(beta.star, betamn, Sig2, log = TRUE)

mh2=sum(dbinom(y,1,theta,log=TRUE))+dmvnorm(beta, betamn, Sig2, log = TRUE)

mh=exp(mh1-mh2)

if(mh > runif(1)){

beta=beta.star

theta=theta.star

accept = accept +1

}

Davgsave[k]=-2*(sum(dbinom(y,1,theta,log=TRUE)))

### Save Samples

betasave[,k]=beta

}

cat("\n")

# Calculate DIC and AIC

postbetamn=apply(betasave[,-(1:n.burn)],1,mean)

print("Posterior Mean for Beta:")

print(postbetamn)

Dhat=-2*(sum(dbinom(y,1,logit.inv(X%*%postbetamn),log=TRUE)))

Davg=mean(Davgsave[-(1:n.burn)])

pD=Davg-Dhat

DIC=2*Davg-Dhat

AIC=Davg+2*ncol(X)

cat("Dhat:",Dhat,"Davg:",Davg,"pD:",pD,"DIC:",DIC,"\n", 'Acceptance:',accept/n.mcmc, '\n')

# Write output

list(y=y,X=X,n.mcmc=n.mcmc,betasave=betasave,postbetamn=postbetamn,acceptance = accept/n.mcmc, DIC = DIC, AIC = AIC)

}

R Script for Logit Model Fitting

library(dplyr)

library(caret)

library(ggplot2)

library(GGally)

library(sjPlot)

library(interplot)

library(pROC)

library(scales)

set.seed(2788)

##### Logit ######

# Import dataset

data <- read.csv("pancreatic.cancer.csv", header = TRUE)

data <- filter(data, sample_origin != 'UCL')

# Binarize diagnosis and sex

data$sex <- ifelse(data$sex == 'F', 1, 0)

data$diagnosis <- ifelse(data$diagnosis == '3', 1, 0)

data_cancer = filter(data, diagnosis == 1)

data_other = filter(data, diagnosis == 0)

# Turn the control variables into factors

data$patient_cohort <- as.factor(data$patient_cohort)

data$sample_origin <- as.factor(data$sample_origin)

# Split

cancer_ind <- sample(seq_len(nrow(data_cancer)), size = floor(0.9*nrow(data_cancer)))

other_ind <- sample(seq_len(nrow(data_other)), size = floor(0.9*nrow(data_other)))

data_anal <- rbind(data_cancer[cancer_ind,], data_other[other_ind,])

data_test <- rbind(data_cancer[-cancer_ind,], data_other[-other_ind,])

# Create Variables and Scale Covariates. check the correlations.

ggpairs(as.data.frame(as.matrix(scale(data_anal[,10:13]))))

# Logit with interaction of all the variables with a corr > 0.5 or < -0.5

logit_interaction <- glm(diagnosis ~ patient_cohort + sample_origin +

age + sex + creatinine + LYVE1 + REG1B + TFF1 + TFF1*REG1B,

data = data_anal, family = "binomial"(link="logit"))

summary(logit_interaction)

# Conditional interaction effect plots

plot1 <- interplot(logit_interaction, var1 = "TFF1", var2 = "REG1B") +

xlab("REG1B") +

ylab("TFF1") +

ggtitle("The effect of TFF1 level on pancreatic cancer

probability conditional on REG1B level")+

theme(plot.title = element_text(size = 10, hjust = 0.5))

plot2 <- interplot(logit_interaction, var1 = "REG1B", var2 = "TFF1") +

xlab("TFF1") +

ylab("REG1B") +

ggtitle("The effect of REG1B level on pancreatic cancer

probability conditional on TFF1 level")+

theme(plot.title = element_text(size = 10,hjust = 0.5))

grid.arrange(plot1, plot2, ncol = 2)

# Marginal effects

plot_model(logit_interaction, type = "pred", terms=c("REG1B", "sample_origin[BPTB, ESP, LIV]"))

plot_model(logit_interaction, type = "pred", terms=c("TFF1", "sample_origin[BPTB, ESP, LIV]"))

plot_model(logit_interaction, type = "pred", terms=c("creatinine", "sample_origin[BPTB, ESP, LIV]"))

plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "sample_origin[BPTB, ESP, LIV]"))

plot_model(logit_interaction, type = "pred", terms=c("REG1B", "patient_cohort[Cohort1, Cohort2]"))

plot_model(logit_interaction, type = "pred", terms=c("TFF1", "patient_cohort[Cohort1, Cohort2]"))

plot_model(logit_interaction, type = "pred", terms=c("creatinine", "patient_cohort[Cohort1, Cohort2]"))

plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "patient_cohort[Cohort1, Cohort2]"))

plot_model(logit_interaction, type = "pred", terms=c("REG1B", "sex[0, 1]"))

plot_model(logit_interaction, type = "pred", terms=c("TFF1", "sex[0,1]"))

plot_model(logit_interaction, type = "pred", terms=c("creatinine", "sex[0,1]"))

plot_model(logit_interaction, type = "pred", terms=c("LYVE1", "sex[0,1]"))

##### Bayesian #####

# Create Variables and Scale Covariates

y = data_anal$diagnosis

n = length(y)

X = cbind(rep(1,n), scale(data_anal$age), data_anal$sex, as.matrix((scale(data_anal[,10:13]))))

# Fit the Model

source("bern.logit.reg.mcmc.wo.corr.R")

out=bern.logit.reg.mcmc(y,X,rep(0,dim(X)[2]),1.5,.1,100000)

par(mfrow = c(1,3))

matplot(t(out$betasave[1:2,]),type="l",lty=1,col = alpha(1:2,.2),ylab = 'beta')

legend('topleft',c('beta_0','beta_1'),

col = 1:2, fill = 1:2,cex=.5)

matplot(t(out$betasave[3:4,]),type="l",lty=1,col = alpha(3:4,.2),ylab='beta')

legend('topleft',c('beta_2','beta_3'),

col = 3:4, fill = 3:4,cex=.5)

matplot(t(out$betasave[5:7,]),type="l",lty=1,col = alpha(5:7,.2),ylab='beta')

legend('topleft',c('beta_4','beta_5','beta_6'),

col = 5:7, fill = 5:7,cex=.5)

# Prediction

# Scale the test dataset

X.mean = c(mean(data_anal$age), mean(data_anal$sex), mean(data_anal[,10]),

mean(data_anal[,11]),mean(data_anal[,12]),mean(data_anal[,13]))

X.sd = c(sd(data_anal$age), sd(data_anal$sex), sd(data_anal[,10]),

sd(data_anal[,11]),sd(data_anal[,12]),sd(data_anal[,13]))

y.test = data_test$diagnosis

X.test = cbind(data_test$age, data_test$sex, as.matrix(data_test[10:13]))

X.test = (X.test - t(replicate(length(y.test),X.mean)))/t(replicate(length(y.test),X.sd))

X.test = cbind(rep(1,length(y.test)),X.test)

# Predict y

logit.inv <- function(z){

exp(z)/(1+exp(z))

}

theta.pred = logit.inv(as.matrix(X.test)%*%out$postbetamn)

y.pred = c()

for (theta in theta.pred){

y.pred = cbind(y.pred, rbinom(1,1,theta))

}

# Plot

layout(1)

plot(1:length(y.test),y.pred,col='red', main = 'Prediction accuracy', ylab = 'Benign vs. Cancer',xlab = 'Patient #')

points(1:length(y.test),y.test, pch = 4, col = 'blue')

legend('center', legend=c("Prediction", "Diagnosis"),

col=c("red", "blue"), pch = c(1,4),cex=0.8)

# Analysis

hit = 0

for (i in 1: length(y.test)){

if(y.test[i]==y.pred[i]){

hit = hit+1

}

cat("Accuracy:", hit/length(y.test))

# Test the model with the data_test: Bayesian

# ROC

bayesian.pred <- mapply(t(theta.pred), FUN=as.numeric)

roc_curve <- roc(data_test$diagnosis, bayesian.pred)

plot5 <- ggroc(roc_curve,color ='blue', size = 3) +

ggtitle(paste0('ROC Curve_Bayesian', '(AUC = ', round(auc(data_test$diagnosis, bayesian.pred),4), ')'))

# Confusion matrix

confusion_matrix_Bayesian <- table(Predicted = y.pred, Actual = y.test)

confusion_Bayesian_df <- as.data.frame(confusion_matrix_Bayesian)

plot3 <- ggplot(confusion_Bayesian_df, aes(x = Actual, y = Predicted, fill = Freq)) +

geom_tile() +

geom_text(aes(label = Freq), color = "black", size = 5) +

scale_fill_gradient(low = "white", high = "blue") +

labs(title = "Confusion Matrix of Bayesian estimation", x = "Actual", y = "Predicted", fill = 'Count') +

theme_minimal()

########

# Test the model with the data_test: MLE

predicted_probs <- predict(logit_interaction, newdata = data_test, type = "response")

predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

confusion_matrix <- table(Predicted = predicted_classes, Actual = data_test$diagnosis)

# ROC

roc_curve <- roc(data_test$diagnosis, predicted_probs)

plot6<-ggroc(roc_curve,color ='red', size = 3) +

ggtitle(paste0('ROC Curve_MLE', '(AUC = ', round(auc(data_test$diagnosis, predicted_probs),4), ')'))

# Confusion matrix

confusion_df <- as.data.frame(confusion_matrix)

plot4 <-ggplot(confusion_df, aes(x = Actual, y = Predicted, fill = Freq)) +

geom_tile() +

geom_text(aes(label = Freq), color = "black", size = 5) +

scale_fill_gradient(low = "white", high = "red") +

labs(title = "Confusion Matrix of MLE estimation", x = "Actual", y = "Predicted", fill = 'Count') +

theme_minimal()

grid.arrange(plot4, plot3, ncol=2)

grid.arrange(plot6, plot5, ncol=2)

Python script for ML models training

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, roc_curve, auc

from sklearn.preprocessing import StandardScaler

import pandas as pd

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier

from tabpfn import TabPFNClassifier

data = pd.read_csv("MLE_finalproj/pancreatic.cancer.csv")

# Preprocess

data = data.drop(columns = ['sample_id','stage','benign_sample_diagnosis','plasma_CA19_9','REG1A'])

data = data[data['sample_origin']!='UCL']

data['sex'] = data['sex'].apply(lambda x: 0 if x == 'M' else 1)

data['patient_cohort'] = data['patient_cohort'].apply(lambda x: 1 if x == 'Cohort1' else 2)

data['sample_origin'] = data['sample_origin'].map({'BPTB':0, 'LIV':1, 'ESP': 2})

data['diagnosis'] = data['diagnosis'].apply(lambda x: 0 if x < 3 else 1)

data

# Split the data

x = data.drop("diagnosis", axis=1)

y = data["diagnosis"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=58/570, random_state=200)

# XGBoost

model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss') # 'use_label_encoder=False' suppresses a warning

model_xgb.fit(x_train, y_train)

y_pred_xgb = model.predict(x_test)

# Confusion matrix

disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_xgb))

disp.plot(cmap=plt.cm.RdPu)

plt.title("Confusion Matrix of XGBoost")

plt.xlabel('Predicted')

plt.ylabel('Actual')

plt.savefig('cm_xgb.png')

# lgbm

model_lgbm = LGBMClassifier()

model_lgbm.fit(x_train, y_train)

y_pred_lgbm = model_lgbm.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgbm))

print("Classification Report:\n", classification_report(y_test, y_pred_lgbm))

# Confusion matrix

disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_lgbm))

disp.plot(cmap=plt.cm.RdPu)

plt.title("Confusion Matrix of LightGBM")

plt.xlabel('Predicted')

plt.ylabel('Actual')

plt.savefig('cm_lgbm.png')

# tabpfn

model_tabpfn = TabPFNClassifier()

model_tabpfn.fit(x_train, y_train)

y_pred_tabpfn = model_tabpfn.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred_tabpfn))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tabpfn))

print("Classification Report:\n", classification_report(y_test, y_pred_tabpfn))

# Confusion matrix

disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test,y_pred_tabpfn))

disp.plot(cmap=plt.cm.RdPu)

plt.title("Confusion Matrix of TabPFN")

plt.xlabel('Predicted')

plt.ylabel('Actual')

plt.savefig('cm_tabpfn.png')

# roc

models = {

"XGBoost": model_xgb,

"TabPFN": model_tabpfn,

"LightGBM": model_lgbm

}

plt.figure(figsize=(6, 6))

for name, model in models.items():

y_probs = model.predict_proba(x_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_probs)

roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr,linewidth = 3, label=f'{name} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Chance',linewidth = 3)

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive Rate')

plt.title('ROC Curve Comparison')

plt.legend(loc='lower right')

plt.grid()

plt.savefig('roc.png')

Page updated

Google Sites

Report abuse