Introduction
# Welcome to the 'steroid data analysis' webpage!
# The procedures and explanations to make all the analysis and plots are in their individual chapters below.
# These methods could be also easily applied to other types of data sets and metabolites than 'steroids' and their respective metadata per se.
# In addition, there is a small 'disclaimer' also at the end of this webpage to emphasize that this site is mainly for educational purposes.
# Please let me know if you have any questions. For that, use the 'following' email: patati at the university of Turku
Loading Required R Packages
# Set library paths if needed
# .libPaths(c("C:/Program Files/R/R-4.4.1/library", .libPaths()))
# Fyi: adding echo=FALSE here (at the R markdown header) does not show this code block.
# List of libraries to load (alphabetically sorted)
packages <- c("bigsnpr", "binilib", "brickster", "car", "censReg", "circlize", "ComplexHeatmap", "correlation",
"corrplot", "daiR", "datarium", "dmetar", "dplyr", "effsize", "extrafont", "forcats", "fs", "FSA",
"ggcorrplot", "ggforce", "ggforestplot", "ggplot2", "ggplotify", "ggpubr", "ggsankey", "ggsankeyfier",
"ggh4x", "ggtext", "glmnet", "grid", "Hmisc", "hrbrthemes", "igraph", "insight", "lavaan", "lmtest",
"lme4", "lsr", "magick", "magrittr", "Maaslin2", "mdatools", "mediation", "meta", "mgcv", "mlma",
"MOFA2", "pheatmap", "PerformanceAnalytics", "pathviewr", "plyr", "plotrix", "ppcor", "prettydoc",
"psych", "quantreg", "qpgraph", "ragg", "RColorBrewer", "rcompanion", "readxl", "remotes", "reshape2",
"rgl", "rmarkdown", "rmdformats", "rstatix", "scales", "scater", "scatterplot3d", "sjPlot", "stringr",
"superb", "tibble", "tidyverse", "tint", "tufte", "viridis","WGCNA", "xlsx")
# Load all libraries
.libPaths(c("C:/Program Files/R/R-4.4.1/library", .libPaths()))
invisible(lapply(packages, library, character.only = TRUE))
# Note: Do not load 'forestplot' as it conflicts with 'ggforestplot'
# You may need to load both rmarkdown and rmd
# Install packages if not already installed
# renv::install() # Installs from the basic R repository
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install(c("ComplexHeatmap", "DESeq2", "dmetar", "fgsea", "ggforestplot", "ggsankey", "limma", "Maaslin2", "metagenomeSeq", "MOFA2", "qpgraph", "scater", "scRNAseq", "sevenbridges"))
# remotes::install_github(c("davidsjoberg/ggsankey", "fossbert/binilib", "MathiasHarrer/dmetar", "mattflor/chorddiag", "NightingaleHealth/ggforestplot"))
# devtools::install_github("mattflor/chorddiag") # Alternative installation method
Importing Data and Metadata
# First set your data folder:
# setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
# or:"C:/Users/patati/Desktop/Turku/R" #check the wd with: here::here() #or getwd()
# load("thereal_v2.RData") #This is so to say real data, and it is not available here (at the site).
# It is easier to load the ready stiched data in one go with .RData file than one by one as below, but
# for educational purposes I put some examples what may need to be done to get your data in ok form
# for later purposes
# setwd("C:/Users/patati/Desktop/Turku/R")
NonAlcoholicFattyLiverDisease=read_excel("NAFLD_SteroidStudy.xlsx",sheet = "LFAT_steroidsDATA") # This is partly auxiliary
columnNames=colnames(NonAlcoholicFattyLiverDisease); NonAlcoholicFattyLiverDisease=data.frame(NonAlcoholicFattyLiverDisease)
#The names of the steroid steroidGroups need to be imported early on:
steroidGroups=read.csv("groups_17823.csv", header = TRUE, sep=";")
steroidGroups=steroidGroups[,c('Group','Abbreviation')]
steroidGroups=steroidGroups[steroidGroups[,'Abbreviation']!='F',]
steroidGroups=steroidGroups[order(steroidGroups[,'Group']),]
steroidGroups[,'Abbreviation'][steroidGroups[,'Abbreviation']=='17aOH-P4']='17a-OHP4'
groupValues=steroidGroups
steroidNames=read_excel("NAFLD_SteroidStudy_for groups.xlsx",sheet = "Steroid name abbreviations")
# This is partly auxiliary
groups2=data.frame(steroidNames)[,1:4]; g1=read.csv("groups_17823.csv", header = TRUE, sep=";")
groups2=cbind(g1[,'Group'], groups2[,c('Abbreviation','Abbreviation_old','Name')])
groups2=groups2[groups2[,'Abbreviation']!='FF',]; colnames(groups2)[1]="Group";
groups2=groups2[order(groups2[,'Group']),]
groups2[,'Abbreviation'][groups2[,'Abbreviation']=='17aOH-P4']='17a-OHP4'
#P4 was found from elsewhere to have the following characteristics:
NonAlcoholicFattyLiverDisease[,'P4'] = as.numeric(NonAlcoholicFattyLiverDisease[,'P4'])
NonAlcoholicFattyLiverDisease[,'P4'][is.na(NonAlcoholicFattyLiverDisease[,'P4'])] = 22557.3330346846
# median(NonAlcoholicFattyLiverDisease[,'P4'], na.rm=TRUE)
NonAlcoholicFattyLiverDisease[,5:7][NonAlcoholicFattyLiverDisease[,5:7]==0.01]=0; colnames(NonAlcoholicFattyLiverDisease)=columnNames
MetabolicAssociatedLiverDisease=read_excel("Combined.Matrix.For.Pauli.2023.10.17.Excel.Formatv2.xlsx") # This is the main file
columnNames=colnames(MetabolicAssociatedLiverDisease); MetabolicAssociatedLiverDisease=data.frame(MetabolicAssociatedLiverDisease); colnames(MetabolicAssociatedLiverDisease)=columnNames
# All kinds of tricks are needed for getting the right data format
rownames(MetabolicAssociatedLiverDisease)=MetabolicAssociatedLiverDisease[,1]
MetabolicAssociatedLiverDisease[,'P4'] = as.numeric(MetabolicAssociatedLiverDisease[,'P4'])
#The same comment as above
MetabolicAssociatedLiverDisease[,'P4'][is.na(MetabolicAssociatedLiverDisease[,'P4'])] = 22557.3330346846
evaluationCriteria=c('Grade(0-3)', 'Stage(0-4)','Necroinflammation')
MetabolicAssociatedLiverDisease[,evaluationCriteria][MetabolicAssociatedLiverDisease[,evaluationCriteria]==0.01]=0;
targetData=c('11-KDHT','AN','DHT','17a-OHP5','E2','P5','DOC')
valueList=c(103,252,51,200,26.5,253,10); valueListAdjusted=c(100,250,50,200,25,250,10)
for (i in 1:7) {MetabolicAssociatedLiverDisease[,targetData][i][MetabolicAssociatedLiverDisease[,targetData][i]==valueList[i]]=valueListAdjusted[i]}
# These (E) are ok as per lab:
menopauseMarkers=read.csv('E_tikka231023.csv',header=TRUE, sep=";")
menopauseMarkersPatients=rownames(MetabolicAssociatedLiverDisease[MetabolicAssociatedLiverDisease[,'E']==106000,])
patientNumbers=menopauseMarkers[which(menopauseMarkers[,1] %in% menopauseMarkersPatients),'patient.number']
markerValues=menopauseMarkers[which(menopauseMarkers[,1] %in% menopauseMarkersPatients),'E']
MetabolicAssociatedLiverDisease[as.character(patientNumbers),'E']=markerValues
# These (11-KA4) will perhaps change in the lab (sometime after 24.10.23):
marker11KA4=read.csv('11KA4_tikka231023.csv',header=TRUE, sep=";")
# marker11KA4[,1][c(1:5,9)];MetabolicAssociatedLiverDisease[as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4']
#These were denoted with 'big interference'
MetabolicAssociatedLiverDisease[as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4'] = NA
#Alternatively: median(MetabolicAssociatedLiverDisease[!rownames(MetabolicAssociatedLiverDisease) %in% as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4'])
BMI_ordered_MASLD=MetabolicAssociatedLiverDisease[order(MetabolicAssociatedLiverDisease[,'BMI']),'BMI']
BMI_ordered_NAFLD=NonAlcoholicFattyLiverDisease[order(NonAlcoholicFattyLiverDisease[,'BMI']),'BMI']
uniqueBMIValues=unique(BMI_ordered_NAFLD[! BMI_ordered_NAFLD %in% BMI_ordered_MASLD])
NonAlcoholicFattyLiverDisease=NonAlcoholicFattyLiverDisease[order(NonAlcoholicFattyLiverDisease[,'BMI']),]
NonAlcoholicFattyLiverDisease=NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[,'BMI']!=uniqueBMIValues,]
MetabolicAssociatedLiverDisease=MetabolicAssociatedLiverDisease[order(MetabolicAssociatedLiverDisease[,'BMI']),]
#https://appsilon.com/imputation-in-r/ #https://www.datasciencemadesimple.com/get-minimum-value-of-a-column-in-r-2/?expand_article=1
# New data import withouth changing the conames: https://readxl.tidyverse.org/articles/column-names.html
bileAcidsLiverData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "Liver_BA",.name_repair = "minimal")); row.names(bileAcidsLiverData)=bileAcidsLiverData[,1]
PFASSerumData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "PFAS_serum",.name_repair = "minimal")); rownames(PFASSerumData)=as.vector(unlist(PFASSerumData[,1]))
serumBileAcidsData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "Serum_BA",.name_repair = "minimal"));rownames(serumBileAcidsData)=as.vector(unlist(serumBileAcidsData[,1]))
C4Data=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "C4",.name_repair = "minimal")); rownames(C4Data)=as.vector(unlist(C4Data[,1]))
clinicalData=data.frame(read_excel("Matching clinical data_all.xlsx",sheet = "Sheet1",.name_repair = "minimal")); rownames(clinicalData)=as.vector(unlist(clinicalData[,1]));
#https://www.analyticsvidhya.com/blog/2021/06/hypothesis-testing-parametric-and-non-parametric-tests-in-statistics/
MetabolicAssociatedLiverDisease[1:2,2:20] #or head(MetabolicAssociatedLiverDisease);
## E 11-KT 17a-OHP5 S B 11b-OHA4 11-KDHT
## 24145190413 47616.43 1013.1342 5023.952 754.5581 8580.997 13735.454 100
## 2459090909 44356.73 935.1387 1111.426 569.7927 6167.239 4709.126 100
## 11-KA4 DHEA T/Epi-T 17aOH-P4 AN DHT A4
## 24145190413 1833.115 12456.910 912.9862 1095.8051 250.0000 50.0000 3065.905
## 2459090909 1004.484 5890.108 10091.7241 864.5884 574.9827 471.7821 1683.676
## DOC P5 P4 A E2
## 24145190413 58.15483 2375.196 270.70746 516.4204 864.3368
## 2459090909 53.65145 1583.961 85.07423 752.5202 60.0817
# The below ordering needs to be changed...
bileAcidsLiverData=bileAcidsLiverData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#bileAcidsLiverData[1:3,2:10] #https://stackoverflow.com/questions/54264980/r-how-to-set-row-names-attribute-as-numeric-from-character
# I did otherway around
serumBileAcidsData=serumBileAcidsData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#serumBileAcidsData[1:3,2:10]
clinicalData=clinicalData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#clinicalData[1:3,2:10]
# Many of these variables are irrelevant/not-used here... so not opening e.g. uniqueBMIValues, they would exhaust this file
C4Data=C4Data[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#C4Data[1:3,]
PFASSerumData=PFASSerumData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#PFASSerumData[1:3,2:10]
# Menopause markers:
menopause=read_excel("Putative_metabolic_markers_menopause.xlsx",sheet='menopause markers',.name_repair = "minimal"); # rownames(clinicalData)=as.vector(unlist(clinicalData[,1]));
menopause=menopause[8:dim(menopause)[1],]; menopause=menopause[,-15]; menopause[2,2:14]=menopause[1,2:14]; menopause=data.frame(menopause); menopause[2,13:14]=c('v1','v2'); #dim(menopause)
colnames(menopause)=c('row_names',menopause[2,2:dim(menopause)[2]]); menopause=menopause[3:dim(menopause)[1],];rownames(menopause)=as.vector(unlist(menopause[,1]));
menopause=menopause[as.character(MetabolicAssociatedLiverDisease$PatientNumber),]
colnames(PFASSerumData)[colnames(PFASSerumData)=='PFHxA.1']='PFHxA_Branched'
PFASSerumData=PFASSerumData[,colnames(PFASSerumData)!='Benzylparaben.1']
PFASSerumData[PFASSerumData[,'Benzylparaben']>10,'Benzylparaben']=NA
Jeihou=data.frame(read_excel("Copy of BA_liverfat_RawData.xls",.name_repair = "minimal")); row.names(Jeihou)=Jeihou[,1];Jeihou=Jeihou[as.character(MetabolicAssociatedLiverDisease$PatientNumber),]
u=Jeihou[Jeihou[,'GHDGA']=='<LLOQ',1]; a=u[!is.na(u)]; b=rownames(bileAcidsLiverData[bileAcidsLiverData[,'GHDGA']==1,]);
uu=Jeihou[Jeihou[,'GHDGA']=='No Result',1]; aa=uu[!is.na(uu)];
bileAcidsLiverData[as.character(a),'GHDGA']=min(bileAcidsLiverData[,'GHDGA'],na.rm=TRUE)/2
AuxBileAcid=bileAcidsLiverData[bileAcidsLiverData[,'GHDGA']==1,1]
bileAcidsLiverData[as.character(AuxBileAcid),'GHDGA']=NA
#https://www.datasciencemadesimple.com/get-minimum-value-of-a-column-in-r-2/?expand_article=1
mat=bileAcidsLiverData[,c('TbMCA','ToMCA','TDCA','TDHCA','TLCA')]
mat[!mat>1]=10000
mat[mat==2]=10000 #Colmins did not work so I used (i.e. colmins did not work so I used):
hip=do.call(pmin, lapply(1:nrow(mat), function(i)mat[i,])) #https://stackoverflow.com/questions/13676878/fastest-way-to-get-min-from-every-column-in-a-matrix
hou=c('TbMCA','ToMCA','TDCA','TDHCA','TLCA')
for (i in 1:5) {bileAcidsLiverData[bileAcidsLiverData[,hou[i]]==1,hou[i]]=hip[i]}
for (i in 1:5) {bileAcidsLiverData[bileAcidsLiverData[,hou[i]]==2,hou[i]]=hip[i]}
# An imputation for the missing values:
C4Data[is.na(C4Data[,2]),2]=median(C4Data[!is.na(C4Data[,2]),2]) #assuming that these were not below quantitation and replacing with median
#https://www.geeksforgeeks.org/performing-logarithmic-computations-in-r-programming-log-log10-log1p-and-log2-functions/
#https://stackoverflow.com/questions/50476717/i-want-to-align-match-two-unequal-columns
#Matching two unequal columns.. match the names of one original column (dat2) to ones that are missing (dat1 with to other) #Not sure if this should be this difficult...
CombinedData=cbind(MetabolicAssociatedLiverDisease[,1],NonAlcoholicFattyLiverDisease[,2:7],clinicalData[,'HOMA.IR'],MetabolicAssociatedLiverDisease[,colnames(NonAlcoholicFattyLiverDisease[,8:27])],bileAcidsLiverData[,2:dim(bileAcidsLiverData)[2]], C4Data[,2:dim(C4Data)[2]],serumBileAcidsData[,2:dim(serumBileAcidsData)[2]],PFASSerumData[,(2:(dim(PFASSerumData)[2]))], MetabolicAssociatedLiverDisease[,'PFAS']);
colnames(CombinedData)[colnames(CombinedData)=='C4Data[, 2:dim(C4Data)[2]]']='C4Data';colnames(CombinedData)[colnames(CombinedData)=='clinicalData[, \"HOMA.IR\"]']='HOMA-IR'
colnames(CombinedData)[colnames(CombinedData)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
colnames(CombinedData)[colnames(CombinedData)=="MetabolicAssociatedLiverDisease[, 1]" ]='PatientNumber';#colnames(CombinedData)#
rownames(CombinedData)=unlist(bileAcidsLiverData[,1]);
RelevantColumns=colnames(CombinedData)[!colnames(CombinedData) %in% c( "Benzylparaben" ,"Methylparaben")]
# Not sure when it is the best time to take not needed variables away, perhaps at the very end?
CombinedData=CombinedData[,RelevantColumns]
# Here I add the lipids. In the future, I need to divide all the steroidGroups in their own components
# e.g. dataframe called 'lipids' so that adding uniqueBMIValues will be more straightforward:
CombinedData=cbind(CombinedData,MetabolicAssociatedLiverDisease[,(dim(MetabolicAssociatedLiverDisease)[2]-13):dim(MetabolicAssociatedLiverDisease)[2]])
# hupo=match( colnames(CombinedData)[colnames(CombinedData) %in% groups2[,3]], groups2[,3] ) # do ni; https://www.geeksforgeeks.org/how-to-find-index-of-element-in-vector-in-r/
# tvauxe=CombinedData
# colnames(CombinedData)[colnames(CombinedData) %in% groups2[,3]]=groups2[hupo,2]
# The basic preprocessing is just the below lines:
tve=CombinedData[,2:dim(CombinedData)[2]]; tve[tve == 0] <- NA; #Almost all variables are here
HalfImputedData <- tve %>% mutate(replace(., is.na(.), min(., na.rm = T)/2)) #https://mdatools.com/docs/preprocessing--autoscaling.html
Log2TransformedData <- log2(HalfImputedData);
AutoScaledData <- prep.autoscale(as.matrix(Log2TransformedData), center = TRUE, scale = TRUE); #https://svkucheryavski.gitbooks.io/mdatools/content/preprocessing/text.html
AllData=cbind(CombinedData[,1],AutoScaledData);
# Changing the column names needs to have separate variables for each type of variable (contaminant, steroid, etc.)
x1=colnames(AllData[,c(1:8)]); v2=dim(NonAlcoholicFattyLiverDisease)[2]+1
x2=colnames(AllData[,9:v2]);v3=(dim(bileAcidsLiverData)[2]+v2);x3=colnames(AllData[,(v2+1):(v3)]);v4=(dim(serumBileAcidsData)[2])+v3
x4=colnames(AllData[,(v3+1):(v4-1)]);x5=colnames(AllData[,(v4):(dim(AllData)[2])]);
x3 <- paste(x3, "_L", sep="") #https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
x4=gsub("(-[0-9]*)*.1", "", x4) #https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub
x4 <- paste(x4, "_S", sep="")# https://rdrr.io/bioc/qpgraph/man/qpNrr.html
x5a=x5[1:9]
x6=x5[10:length(x5)] #Dividing to lipids
x5=x5a #Making sure that PFAS are separate
nm = c(x1,x2,x3,x4,x5,x6); nm=c('PatientNumber','Gender','AGE','BMI','Steatosis Grade','Fibrosis Stage','Necroinflammation','HOMA-IR',nm[9:length(nm)])
colnames(AllData)=nm; #AllData[1:5,1:30]; #NonAlcoholicFattyLiverDisease[1:2,1:28];
colnames(AllData)[colnames(AllData)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
# This (deletion) is good to do after all the previous:
x5=x5[x5!='PFAS'];x5=x5[x5!='Perfluorodecyl.ethanoic.acid']; x6=x6[x6!='Total_TG'] # x1;x2;x3;x4;x5;
AllData=AllData[,!colnames(AllData) %in% c('Total_TG','PFAS',"Perfluorodecyl.ethanoic.acid")]
# In case you would need just the logged values:
tv_half_log22=cbind(CombinedData[,1],Log2TransformedData);
x1=colnames(tv_half_log22[,c(1:8)]); v2=dim(NonAlcoholicFattyLiverDisease)[2]+1
x2=colnames(tv_half_log22[,9:v2]);v3=(dim(bileAcidsLiverData)[2]+v2);
x3=colnames(tv_half_log22[,(v2+1):(v3)]);v4=(dim(serumBileAcidsData)[2])+v3
x3=x3[c(length(x3),1:(length(x3)-1))]
x4=colnames(tv_half_log22[,(v3+1):(v4-1)]);
x5=colnames(tv_half_log22[,(v4):(dim(tv_half_log22)[2])]);
x3 <- paste(x3, "_L", sep="")
#https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
x4=gsub("(-[0-9]*)*.1", "", x4) #https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub
x4 <- paste(x4, "_S", sep="")# https://rdrr.io/bioc/qpgraph/man/qpNrr.html
x5a=x5[1:9]
x6=x5[10:length(x5)] #dividing to lipids
x5=x5a #making sure that PFAS are separate
nm = c(x1,x2,x3,x4,x5,x6); nm=c('PatientNumber','Gender','AGE','BMI','Steatosis Grade','Fibrosis Stage','Necroinflammation','HOMA-IR',nm[9:length(nm)])
colnames(tv_half_log22)=nm; #tv_half_log22[1:5,1:30]; #NonAlcoholicFattyLiverDisease[1:2,1:28];
colnames(tv_half_log22)[colnames(tv_half_log22)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
# This (deletion) is good to do after all the previous:
x5=x5[x5!='PFAS'];x5=x5[x5!='Perfluorodecyl.ethanoic.acid']; x6=x6[x6!='Total_TG'] # x1;x2;x3;x4;x5;
tv_half_log22=tv_half_log22[,!colnames(tv_half_log22) %in% c('Total_TG','PFAS',"Perfluorodecyl.ethanoic.acid")]
# This needs to be done early on:
colnames(CombinedData)[colnames(CombinedData)=='17aOH-P4']='17a-OHP4'
colnames(tv_half_log22)[colnames(tv_half_log22)=='17aOH-P4']='17a-OHP4'
colnames(AllData)[colnames(AllData)=='17aOH-P4']='17a-OHP4'
AllData=AllData[,!colnames(AllData) %in% c('Total_TG','PFAS','Perfluorodecyl.ethanoic.acid')]
AllData=AllData[,!colnames(AllData) %in% x4]
# In case you would need nonscaled covariates and scaled/logged all other variables:
CovariatesScaledData=AllData
CovariatesNonScaledData=cbind(CombinedData[,1:8],AllData[,9:dim(AllData)[2]])
LogCovariatesScaledData=tv_half_log22
LogCovariatesNonScaledData=cbind(CombinedData[,1:8],tv_half_log22[,9:dim(tv_half_log22)[2]])
colnames(CovariatesNonScaledData)[1:8]=colnames(AllData)[1:8]
colnames(LogCovariatesNonScaledData)[1:8]=colnames(AllData)[1:8]
# This is needed occasionally:
CurrentData=CovariatesScaledData
# https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
# https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub # https://rdrr.io/bioc/qpgraph/man/qpNrr.html
hupo=match( colnames(CurrentData)[colnames(CurrentData) %in% groups2[,3]], groups2[,3] )
# do ni; https://www.geeksforgeeks.org/how-to-find-index-of-element-in-vector-in-r/
tvauxe=CurrentData
colnames(CurrentData)[colnames(CurrentData) %in% groups2[,3]]=groups2[hupo,2]
# This needs to be done also soon, to gather all the treatment etc. variable names separately...:
TreatmentVariables=colnames(AllData)[52:58];
MediatorVariables=colnames(AllData)[9:28];
OutcomeVariables=colnames(AllData)[c(29:51,59:71)]; ##https://sparkbyexamples.com/r-programming/r-remove-from-vector-with-examples/
OutcomeVariables=OutcomeVariables[!OutcomeVariables %in% c('Total_TG','PFAS','Perfluorodecyl.ethanoic.acid')]
OutcomeVariables=OutcomeVariables[! OutcomeVariables %in% x4] #https://sparkbyexamples.com/r-programming/r-remove-from-vector-with-examples/
MediatorVariables[MediatorVariables=="17aOH-P4"]="17a-OHP4"
TreatmentVariables=TreatmentVariables[!TreatmentVariables %in% c('Perfluorodecyl.ethanoic.acid')]
colnames(CovariatesScaledData)[colnames(CovariatesScaledData)=='C4Data_L']='C4_L'
colnames(CurrentData)[colnames(CurrentData)=='C4Data_L']='C4_L'
colnames(tvauxe)[colnames(tvauxe)=='C4Data_L']='C4_L'
colnames(AllData)[colnames(AllData)=='C4Data_L']='C4_L'
colnames(NonAlcoholicFattyLiverDisease)[colnames(NonAlcoholicFattyLiverDisease)=='C4Data_L']='C4_L'
x3[x3=='C4Data_L']='C4_L'
# Let's introduce some randomization. First, let us define the range of the random numbers:
# min_val <- 0.83; max_val <- 1.17
#
# # Then a function for the randomization:
# multiply_with_random <- function(df, cols, min_val, max_val) {
# for (col in cols) {
# if (col %in% names(df)) {
# df[[col]] <- df[[col]] * runif(nrow(df), min = min_val, max = max_val)
# } else {
# warning(paste("Column", col, "does not exist in the data frame"))}}
# return(df)}
#
# CovariatesScaledData=multiply_with_random(CovariatesScaledData, 3:(dim(CovariatesScaledData)[2]), min_val, max_val)
# CurrentData=multiply_with_random(CurrentData, 3:(dim(CurrentData)[2]), min_val, max_val)
# tvauxe=multiply_with_random(tvauxe, 3:(dim(tvauxe)[2]), min_val, max_val)
# AllData=multiply_with_random(AllData, 3:(dim(AllData)[2]), min_val, max_val)
# Just in case:
# save.image('thereal_v2.RData')
# setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
# Got some help for commenting and variable namings from Copilot to above code.
# Also multiply_with_random was done together with Copilot.
Setting Global Variables
options(scipen = 999) # Disable scientific notation
# rm(list = ls()) # Clear workspace; this should not be if you have the load above
thedate <- strftime(Sys.Date(), "%d%m%y") #Do not take the old date from the load...
date <- paste0('tikka', thedate) # Customize this as needed
knitr::opts_knit$set(root.dir = 'C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis') #Is this working? Yes, but in knitr:https://forum.posit.co/t/setting-working-directory-in-rmarkdown/70849
# Example installation commands
# remotes::install_github("fossbert/binilib", force=TRUE)
# install.packages(c('tidyverse', 'tibble'))
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("Maaslin2")
# devtools::install_github("davidsjoberg/ggforestplot")
# remotes::install_version("insight", version = "0.20.5", repos = "http://cran.us.r-project.org", force=TRUE)
# font_import() # Import fonts if not already done
# loadfonts(device = "win") # Load fonts for Windows
# renv::status() # Check renv status
# library(rmarkdown); render("path/to/file.Rmd") # Render R Markdown document
# remove.packages("DelayedArray")
# BiocManager::install("DelayedArray")
# install.packages("Require") # Install 'Require' package
# If the installation to the project file is problematic, do it to the general folder and add it to the .libPaths(), as above library import.
Making Boxplots
#https://r-graph-gallery.com/265-grouped-boxplot-with-ggplot2.html
#https://stackoverflow.com/questions/53724834/why-does-the-plot-size-differ-between-docx-and-html-in-rmarkdownrender
CreateBoxplots <- function(tvt, Group, OutcomeVariables, Out, oute, other) {
# Filter data based on gender
tvt <- tvt %>%
filter(if (Group == 'Male') Gender == 2 else if (Group == 'Female') Gender == 1 else TRUE)
# Prepare data for plotting
Steroid <- rep(colnames(tvt[, 9:28]), each = nrow(tvt))
data2 <- rep('Control', nrow(tvt))
num <- ifelse(OutcomeVariables == 'HOMA-IR', 1.5, min(tvt[[OutcomeVariables]]))
data2[tvt[[OutcomeVariables]] > num] <- 'Case'
TreatmentVariables <- data2
Concentration <- as.vector(unlist(tvt[, 9:28]))
data <- data.frame(Steroid, TreatmentVariables, Concentration)
data$Group <- 0
# Correct steroid names if the level exists
if ("17aOH-P4" %in% levels(data$Steroid)) {
data <- data %>%
mutate(Steroid = fct_recode(Steroid, '17a-OHP4' = '17aOH-P4'))
}
# Assign steroidGroups
rownames(groups2) <- 1:20
for (i in seq_len(nrow(groups2))) {
data[data$Steroid %in% groups2$Abbreviation[i], 'Group'] <- groups2$Group[i]
}
# Set plot title
title <- paste(Out, "'s Effect on Concentrations of Steroids in ", Group, sep = "")
# Define legend labels
e1 <- ifelse(num == 1.5, paste('Case (>', num, ')', sep = ""), paste('Case (>', 0, ')', sep = ""))
e2 <- ifelse(num == 1.5, paste('Control (<=', num, ')', sep = ""), paste('Control (=', 0, ')', sep = ""))
# Remove rows with NA concentrations
data <- data %>% filter(!is.na(Concentration))
# Create boxplot
p <- ggplot(data, aes(x = Steroid, y = Concentration, fill = TreatmentVariables)) +
geom_boxplot(notch = FALSE, notchwidth = 0.5, outlier.shape = 1, outlier.size = 2, coef = 1.5) +
theme_classic2() +
theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2, size = 10.5),
axis.text = element_text(color = "black"),
panel.grid.minor = element_blank(),
text = element_text(size = 10.5, family = "Calibri"),
axis.title = element_text(size = 14),
plot.title = element_text(size = 14),
legend.text = element_text(size = 14),
legend.title = element_text(size = 14)) +
labs(x = "Steroids", y = "Log2 of Picomolar Concentrations", title = title) +
scale_fill_manual(values = c("orange", "blue"), name = oute, labels = c(e1, e2)) +
facet_grid(~Group, scales = "free_x", space = "free") +
stat_compare_means(hide.ns = TRUE, label = "p.signif", method = "wilcox.test",
symnum.args = list(cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("****", "***", "**", "*", "ns")),
size = 8, paired = FALSE, label.y = 15.5)
# Save plot as PNG
path <- "C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis/"
pngfile <- fs::path(path, paste0(Group, Out, 'boxe', ".png"))
ragg::agg_png(pngfile, width = 60, height = 36, units = "cm", res = 300,scaling = 2)
plot(p)
invisible(dev.off())
knitr::include_graphics(pngfile)
}
# Example usage
LogCovariatesNonScaledData[, '11-KA4'][LogCovariatesNonScaledData[, '11-KA4'] == min(LogCovariatesNonScaledData[, '11-KA4'])] <- median(LogCovariatesNonScaledData[, '11-KA4'])
other <- '261124'
ie <- LogCovariatesNonScaledData#
hupo <- match(colnames(ie)[colnames(ie) %in% groups2[, 3]], groups2[, 3])
colnames(ie)[colnames(ie) %in% groups2[, 3]] <- groups2[hupo, 2]
windowsFonts(A = windowsFont("Calibri (Body)"))
knitr::opts_knit$set(root.dir = 'C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis')
# The significance levels are: '****<0.001', '***<0.01', '**<0.05', '*<0.1'
OutcomeVariables='Steatosis Grade';Out='Steatosis'; oute='Steatosis Grade';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
OutcomeVariables='Fibrosis Stage';Out='Fibrosis'; oute='Fibrosis Stage';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
# https://www.elsevier.es/en-revista-annals-hepatology-16-articulo-assessment-hepatic-fibrosis-necroinflammation-among-S1665268119314590 #So it is in grade
OutcomeVariables='Necroinflammation';Out='Necroinflammation'; oute='Necroinflammation Grade';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
OutcomeVariables='HOMA-IR';Out='HOMA-IR'; oute='HOMA-IR';num=1.5;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
Making Forest Plots
# Define the NonAlcoholicFattyLiverDisease dataset by selecting the first 28 columns from CombinedData
NonAlcoholicFattyLiverDisease <- CombinedData[, 1:28]
# Convert specific columns to binary values using vectorized operations
cols_to_binary <- c(5, 6, 7)
NonAlcoholicFattyLiverDisease[, cols_to_binary] <- (NonAlcoholicFattyLiverDisease[, cols_to_binary] > 0) * 1
# Convert column 8 to binary based on the threshold of 1.5
NonAlcoholicFattyLiverDisease[, 8] <- (NonAlcoholicFattyLiverDisease[, 8] > 1.5) * 1
# Clean column names to remove special characters and make uniqueBMIValues consistent
patterns <- c("-", "/", "11", "17", "#")
replacements <- c(".", ".", "X11", "X17", ".") #?
# Ensure patterns and replacements are correctly paired
if (length(patterns) == length(replacements)) {
for (i in seq_along(patterns)) {
colnames(NonAlcoholicFattyLiverDisease) <- gsub(patterns[i], replacements[i], colnames(NonAlcoholicFattyLiverDisease))}} else {
stop("Patterns and replacements vectors must be of the same length.")}
# This works with the autoscaled (raw if loge=1 and remove 1 in the means) data NonAlcoholicFattyLiverDisease as well...
CalculateErrors=function(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name,ordera,oute,first,e,xlim) { # Group='Female'
# Filter data based on the 'Group' variable
NAFLDo <- switch(Group,
"Male" = NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[,'SEX.1F.2M'] == 2, ],
"Female" = NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[,'SEX.1F.2M'] == 1, ],
"All" = NonAlcoholicFattyLiverDisease)
# Initialize vectors to store sample data and counts
sample_data <- list();
n0 <- n1 <- 0
# Loop through the two steroidGroups (OutcomeVariables == 0 and OutcomeVariables > 0)
for (i in 1:2) {
SG0 <- if (i == 1) {
NAFLDo[NAFLDo[, OutcomeVariables] == 0, ]
} else {
NAFLDo[NAFLDo[, OutcomeVariables] > 0, ]}
# Store the count of samples in each group
if (i == 1) {
n0 <- nrow(SG0)
} else {
n1 <- nrow(SG0)}
# Calculate medians and standard deviations for columns 9 to 28
means <- apply(SG0[, 9:28], 2, median, na.rm = TRUE)
sds <- apply(SG0[, 9:28], 2, sd, na.rm = TRUE)
# Calculate error margins
error_lower <- means - sds
error_upper <- means + sds
error <- sds
# Append results to sample_data
sample_data[[i]] <- data.frame(study = colnames(NonAlcoholicFattyLiverDisease[, 9:28]),
index = colnames(NonAlcoholicFattyLiverDisease[, 9:28]),
result = means,
error = error)}
df=data.frame(sample_data) #
# Calculate p-values using Wilcoxon test for columns 9 to 28
ps <- sapply(9:28, function(j) {
xnam <- colnames(NAFLDo)[j]
fmla <- as.formula(paste(xnam, "~", OutcomeVariables))
wilcox.test(fmla, data = NAFLDo, exact = FALSE)$p.value})
#https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
# Calculate the ratio of results and log-transform
a <- df[df[, 1] == e, 'result.1'] / df[df[, 1] == e, 'result']
ResComb <- data.frame(log(df$result.1 / df$result))
# Rename columns and clean up variable names
ResComb$result <- ResComb[, 1]
ResComb$name <- df$study
ResComb <- ResComb[, 2:3]
ResComb$name <- gsub("\\.", "-", ResComb$name)
ResComb$name <- gsub("X11", "11", ResComb$name)
ResComb$name <- gsub("X17", "17", ResComb$name)
ResComb$name[ResComb$name == "T-Epi-T"] <- "T/Epi-T"
ResComb$pval <- ps
# Calculate result_pure and error
ResComb$result_pure <- df$result.1 / df$result
ResComb$error <- (abs((1 / df$result) * df$error.1) + abs((df$result.1 / df$result^2) * df$error)) / nrow(NAFLDo) * 1.64
# Adjust error values
ResComb$error <- ifelse(ResComb$error > (median(ResComb$error) + sd(ResComb$error)), median(ResComb$error) * 1.25, ResComb$error)
# Calculate error bounds and log-transformed values
ResComb$errord1a <- ResComb$result_pure - ResComb$error
ResComb$errord2a <- ResComb$result_pure + ResComb$error
ResComb$errord1 <- log(ResComb$errord1a)
ResComb$errord2 <- log(ResComb$errord2a)
ResComb$result <- log(ResComb$result_pure)
ResComb$Control <- df$result
ResComb$Case <- df$result.1
# Add p-values and significance
ResComb$pval0 <- ResComb$pval
ResComb$pval1 <- ResComb$pval
ResComb$Significance0 <- ifelse(ResComb$pval0 < 0.1, 'Yes', 'No')
ResComb$Color0 <- ifelse(ResComb$pval0 < 0.1, 'blue', 'grey')
ResComb$Significance1 <- ifelse(ResComb$pval1 < 0.1, 'Yes', 'No')
ResComb$Color1 <- ifelse(ResComb$pval1 < 0.1, 'blue', 'grey')
# Merge with group data and sort
gn <- steroidGroups[steroidGroups$Abbreviation != 'F', c('Group', 'Abbreviation')]
gn <- gn[order(gn$Abbreviation), ]
ResComb <- ResComb[order(ResComb$name), ]
ResComb <- cbind(ResComb, gn[order(gn$Abbreviation), ])
ResComb <- ResComb[order(-ResComb$result), ]
xlab = "Autoscaled Concentrations (SE)"
xlim=c(min(ResComb$errord1),max(ResComb$errord2))
# Create forest plot
plote2 <- forestplot(df = ResComb,
estimate = result,
se = 0,
pvalue = pval1,
psignif = 0.1,
xlim = xlim,
xlab = 'Logged Ratio between Raw Concentrations of Case and Control with 90% CI',
ylab = 'Steroid Groups',
title = '',
colour = Significance1) +
ggforce::facet_col(facets = ~Group, scales = "free_y", space = "free", strip.position = 'left') +
geom_errorbarh(aes(xmin = errord1, xmax = errord2, height = .0, colour = Significance1))
# Set color palette
hp <- if (sum(ResComb$Significance1 == 'Yes') == 20) c('blue', 'blue') else c('#999999', 'blue')
# Order factor levels based on Group and first
if (Group=='All' & first==TRUE) {ordera=rev(steroidGroups$Abbreviation)#ResComb$name[order(ResComb$result)]; #
plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='All' & first==FALSE) {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='Female') {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='Male') {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)}
#https://www.r-bloggers.com/2020/03/how-to-standardize-group-colors-in-data-visualizations-in-r/
plote2$layers[[1]]$aes_params$odd <- "#00000000" #https://stackoverflow.com/questions/71745719/how-to-control-stripe-transparency-using-ggforestplot-geom-stripes
ResComb$Group2=ResComb$Group
ResComb <- transform(ResComb,Group2 = as.numeric(as.factor(Group2)))
ResComb$facet_fill_color <- c("red", "green", "blue", "yellow", "brown")[ResComb$Group2]
# Create plot with custom themes
PlotVar <- plote2 + theme(axis.text.y = element_blank()) + theme_classic2()
PlotVar2 <- PlotVar + geom_point(aes(colour = factor(Significance1)), colour = ResComb$Color1) +
scale_color_manual(values = hp) + theme(legend.position = "none") + theme(strip.text.y = element_text(size = -Inf))
# Customize facet strip colors
g <- ggplot_gtable(ggplot_build(PlotVar2))
stripr <- which(grepl('strip-l', g$layout$name))
fills <- c("red", "green", "blue", "yellow", "brown")
for (i in seq_along(stripr)) {
j <- which(grepl('rect', g$grobs[[stripr[i]]]$grobs[[1]]$childrenOrder))
g$grobs[[stripr[i]]]$grobs[[1]]$children[[j]]$gp$fill <- fills[i]}
# grid::grid.draw(g)
# Save plot as JPEG and convert to PDF and SVG
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
jpeg(paste(name, "divi.jpg"), width = 7500, height = 11000, quality = 100, pointsize = 16, res = 1000)
print(grid::grid.draw(g))
dev.off()
daiR::image_to_pdf(paste(name, "divi.jpg"), pdf_name = paste0(paste(name, "divi.jpg"), '.pdf'))
my_image <- image_read(paste(name, "divi.jpg"))
my_svg <- image_convert(my_image, format = "svg")
image_write(my_svg, paste(name, "divi.svg"))
return(ordera) #If you do not want to have 'null' to the Rmarkdown/html take this away
}
# This is with first(!!). Use it.
OutcomeVariables='Steatosis.Grade.0.To.3';Out='Steatosis'; oute='Steatosis';first=TRUE; e='P4';ordera=c();
Group='All';name1=paste("Forest plot of",Group, "Steroid Ratios in",Out);
hel=CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name1,ordera,oute,first,e,xlim)
# #Afterwards:
first=FALSE;
Group='Female';name2=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name2,ordera=hel,oute,first,e,xlim)
Group='Male'; name3=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name3,ordera=hel,oute,first,e,xlim)
#
OutcomeVariables='Fibrosis.Stage.0.to.4'; Out='Fibrosis';oute='Fibrosis';
Group='All'; name4=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name4,ordera=hel,oute,first,e,xlim)
Group='Female';name5=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name5,ordera=hel,oute,first,e,xlim)
Group='Male'; name6=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name6,ordera=hel,oute,first,e,xlim)
#
OutcomeVariables='Necroinflammation'; Out='Necroinflammation';oute='Necroinflammation';
Group='All'; name7=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name7,ordera=hel,oute,first,e,xlim) #not the very first though...
Group='Female';name8=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name8,ordera=hel,oute,first,e,xlim)
Group='Male'; name9=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name9,ordera=hel,oute,first,e,xlim)
#
OutcomeVariables='HOMA.IR';Out='HOMA-IR';oute='HOMAIR';
Group='All';name10=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name10,ordera=hel,oute,first,e,xlim) #not the very first though...
Group='Female';name11=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name11,ordera=hel,oute,first,e,xlim)
Group='Male'; name12=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name12,ordera=hel,oute,first,e,xlim)
# Fyi: I was able to revise some of the above codes with Copilot...
Forest plot of All Steroid Ratios in Steatosis
Forest plot of Female Steroid Ratios in Steatosis
Forest plot of Male Steroid Ratios in Steatosis