Introduction
# Welcome to the 'steroid data analysis' webpage! a
# The procedures and explanations to make all the analysis and plots are in their individual chapters below.
# These methods could be also easily applied to other types of data sets and metabolites than 'steroids' and their respective metadata per se.
# In addition, there is a small 'disclaimer' also at the end of this webpage to emphasize that this site is mainly for educational purposes.
# Please let me know if you have any questions. For that, use the 'following' email: patati at the university of Turku
Loading Required R Packages
# Set library paths if needed
# .libPaths(c("C:/Program Files/R/R-4.4.1/library", .libPaths()))
# Fyi: adding echo=FALSE here (at the R markdown header) does not show this code block.
# List of libraries to load (alphabetically sorted)
packages <- c("bigsnpr", "binilib", "brickster", "car", "censReg", "circlize", "ComplexHeatmap", "correlation",
"corrplot", "daiR", "datarium", "dmetar", "dplyr", "effsize", "extrafont", "forcats", "fs", "FSA",
"ggcorrplot", "ggforce", "ggforestplot", "ggplot2", "ggplotify", "ggpubr", "ggsankey", "ggsankeyfier",
"ggh4x", "ggtext", "glmnet", "grid", "Hmisc", "hrbrthemes", "igraph", "insight", "lavaan", "lmtest",
"lme4", "lsr", "magick", "magrittr", "Maaslin2", "mdatools", "mediation", "meta", "mgcv", "mlma",
"MOFA2", "pheatmap", "PerformanceAnalytics", "pathviewr", "plyr", "plotrix", "ppcor", "prettydoc",
"psych", "quantreg", "qpgraph", "ragg", "RColorBrewer", "rcompanion", "readxl", "remotes", "reshape2",
"rgl", "rmarkdown", "rmdformats", "rstatix", "scales", "scater", "scatterplot3d", "sjPlot", "stringr",
"superb", "tibble", "tidyverse", "tint", "tufte", "viridis","WGCNA", "xlsx")
# Load all libraries
.libPaths(c("C:/Program Files/R/R-4.4.1/library", .libPaths()))
# invisible(lapply(packages, library, character.only = TRUE))
# invisible(lapply(packages, function(pkg) {
# if (!requireNamespace(pkg, quietly = TRUE)) {
# install.packages(pkg)
# }
# library(pkg, character.only = TRUE)
# }))
#
library(scales)
invisible(lapply(setdiff(packages, "scales"), library, character.only = TRUE))
# Note: Do not load 'forestplot' as it conflicts with 'ggforestplot'
# You may need to load both rmarkdown and rmdformats and install some dependencies (if prompted) to get this loading done.
# If done together with Github, this maybe good to know: https://stackoverflow.com/questions/11384928/change-git-repository-directory-location
# Install packages if not already installed
# renv::install() # Installs from the basic R repository
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install(c("ComplexHeatmap", "DESeq2", "dmetar", "fgsea", "ggforestplot", "ggsankey", "limma", "Maaslin2", "metagenomeSeq", "MOFA2", "qpgraph", "scater", "scRNAseq", "sevenbridges"))
# remotes::install_github(c("davidsjoberg/ggsankey", "fossbert/binilib", "MathiasHarrer/dmetar", "mattflor/chorddiag", "NightingaleHealth/ggforestplot"))
# devtools::install_github("mattflor/chorddiag") # Alternative installation method
#Fyi: Run Current Chunk:
# Ctrl + Alt + C (Windows/Linux)
# Run All Chunks Above:
# Ctrl + Alt + P (Windows/Linux)
# Run All Chunks Below:
# Ctrl + Alt + N (Windows/Linux)
# Run All Chunks in Document:
# Ctrl + Alt + R (Windows/Linux)
Importing Data and Metadata
# First set your data folder:
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
# or:"C:/Users/patati/Desktop/Turku/R" #check the wd with: here::here() #or getwd()
load("thereal_v2.RData") #This is so to say real data, and it is not available here (at the site).
# It is easier to load the ready stiched data in one go with .RData file than one by one as below, but
# for educational purposes I put some examples what may need to be done to get your data in ok form
# for later purposes
setwd("C:/Users/patati/Desktop/Turku/R")
NonAlcoholicFattyLiverDisease=read_excel("NAFLD_SteroidStudy.xlsx",sheet = "LFAT_steroidsDATA") # This is partly auxiliary
columnNames=colnames(NonAlcoholicFattyLiverDisease); NonAlcoholicFattyLiverDisease=data.frame(NonAlcoholicFattyLiverDisease)
#The names of the steroid steroidGroups need to be imported early on:
steroidGroups=read.csv("groups_17823.csv", header = TRUE, sep=";")
steroidGroups=steroidGroups[,c('Group','Abbreviation')]
steroidGroups=steroidGroups[steroidGroups[,'Abbreviation']!='F',]
steroidGroups=steroidGroups[order(steroidGroups[,'Group']),]
steroidGroups[,'Abbreviation'][steroidGroups[,'Abbreviation']=='17aOH-P4']='17a-OHP4'
groupValues=steroidGroups
steroidNames=read_excel("NAFLD_SteroidStudy_for groups.xlsx",sheet = "Steroid name abbreviations")
# This is partly auxiliary
groups2=data.frame(steroidNames)[,1:4]; g1=read.csv("groups_17823.csv", header = TRUE, sep=";")
groups2=cbind(g1[,'Group'], groups2[,c('Abbreviation','Abbreviation_old','Name')])
groups2=groups2[groups2[,'Abbreviation']!='FF',]; colnames(groups2)[1]="Group";
groups2=groups2[order(groups2[,'Group']),]
groups2[,'Abbreviation'][groups2[,'Abbreviation']=='17aOH-P4']='17a-OHP4'
groups2[,'Group'][groups2[,'Abbreviation_old']=='DOC']='Mineralocorticoids'
# head(groups2)
#P4 was found from elsewhere to have the following characteristics:
NonAlcoholicFattyLiverDisease[,'P4'] = as.numeric(NonAlcoholicFattyLiverDisease[,'P4'])
# NonAlcoholicFattyLiverDisease[,'P4'][is.na(NonAlcoholicFattyLiverDisease[,'P4'])] = 22557.3330346846
# median(NonAlcoholicFattyLiverDisease[,'P4'], na.rm=TRUE)
NonAlcoholicFattyLiverDisease[,5:7][NonAlcoholicFattyLiverDisease[,5:7]==0.01]=0; colnames(NonAlcoholicFattyLiverDisease)=columnNames
MetabolicAssociatedLiverDisease=read_excel("Combined.Matrix.For.Pauli.2023.10.17.Excel.Formatv2.xlsx") # This is the main file
columnNames=colnames(MetabolicAssociatedLiverDisease); MetabolicAssociatedLiverDisease=data.frame(MetabolicAssociatedLiverDisease); colnames(MetabolicAssociatedLiverDisease)=columnNames
# All kinds of tricks are needed for getting the right data format
rownames(MetabolicAssociatedLiverDisease)=MetabolicAssociatedLiverDisease[,1]
MetabolicAssociatedLiverDisease[,'P4'] = as.numeric(MetabolicAssociatedLiverDisease[,'P4'])
#The same comment as above
MetabolicAssociatedLiverDisease[,'P4'][is.na(MetabolicAssociatedLiverDisease[,'P4'])] = 22557.3330346846
evaluationCriteria=c('Grade(0-3)', 'Stage(0-4)','Necroinflammation')
MetabolicAssociatedLiverDisease[,evaluationCriteria][MetabolicAssociatedLiverDisease[,evaluationCriteria]==0.01]=0;
targetData=c('11-KDHT','AN','DHT','17a-OHP5','E2','P5','DOC')
valueList=c(103,252,51,200,26.5,253,10); valueListAdjusted=c(100,250,50,200,25,250,10)
for (i in 1:7) {MetabolicAssociatedLiverDisease[,targetData][i][MetabolicAssociatedLiverDisease[,targetData][i]==valueList[i]]=valueListAdjusted[i]}
# These (E) are ok as per lab:
menopauseMarkers=read.csv('E_tikka231023.csv',header=TRUE, sep=";")
menopauseMarkersPatients=rownames(MetabolicAssociatedLiverDisease[MetabolicAssociatedLiverDisease[,'E']==106000,])
patientNumbers=menopauseMarkers[which(menopauseMarkers[,1] %in% menopauseMarkersPatients),'patient.number']
markerValues=menopauseMarkers[which(menopauseMarkers[,1] %in% menopauseMarkersPatients),'E']
MetabolicAssociatedLiverDisease[as.character(patientNumbers),'E']=markerValues
# These (11-KA4) will perhaps change in the lab (sometime after 24.10.23):
marker11KA4=read.csv('11KA4_tikka231023.csv',header=TRUE, sep=";")
# marker11KA4[,1][c(1:5,9)];MetabolicAssociatedLiverDisease[as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4']
#These were denoted with 'big interference'
MetabolicAssociatedLiverDisease[as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4'] = NA
#Alternatively: median(MetabolicAssociatedLiverDisease[!rownames(MetabolicAssociatedLiverDisease) %in% as.character(marker11KA4[,1][c(1:5,9)]),'11-KA4'])
BMI_ordered_MASLD=MetabolicAssociatedLiverDisease[order(MetabolicAssociatedLiverDisease[,'BMI']),'BMI']
BMI_ordered_NAFLD=NonAlcoholicFattyLiverDisease[order(NonAlcoholicFattyLiverDisease[,'BMI']),'BMI']
uniqueBMIValues=unique(BMI_ordered_NAFLD[! BMI_ordered_NAFLD %in% BMI_ordered_MASLD])
NonAlcoholicFattyLiverDisease=NonAlcoholicFattyLiverDisease[order(NonAlcoholicFattyLiverDisease[,'BMI']),]
NonAlcoholicFattyLiverDisease=NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[,'BMI']!=uniqueBMIValues,]
MetabolicAssociatedLiverDisease=MetabolicAssociatedLiverDisease[order(MetabolicAssociatedLiverDisease[,'BMI']),]
#https://appsilon.com/imputation-in-r/ #https://www.datasciencemadesimple.com/get-minimum-value-of-a-column-in-r-2/?expand_article=1
# New data import withouth changing the conames: https://readxl.tidyverse.org/articles/column-names.html
bileAcidsLiverData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "Liver_BA",.name_repair = "minimal")); row.names(bileAcidsLiverData)=bileAcidsLiverData[,1]
PFASSerumData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "PFAS_serum",.name_repair = "minimal")); rownames(PFASSerumData)=as.vector(unlist(PFASSerumData[,1]))
serumBileAcidsData=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "Serum_BA",.name_repair = "minimal"));rownames(serumBileAcidsData)=as.vector(unlist(serumBileAcidsData[,1]))
C4Data=data.frame(read_excel("Liver_bile_acids_PFAS.xlsx",sheet = "C4",.name_repair = "minimal")); rownames(C4Data)=as.vector(unlist(C4Data[,1]))
clinicalData=data.frame(read_excel("Matching clinical data_all.xlsx",sheet = "Sheet1",.name_repair = "minimal")); rownames(clinicalData)=as.vector(unlist(clinicalData[,1]));
#https://www.analyticsvidhya.com/blog/2021/06/hypothesis-testing-parametric-and-non-parametric-tests-in-statistics/
MetabolicAssociatedLiverDisease[1:2,2:20] #or head(MetabolicAssociatedLiverDisease);
## E 11-KT 17a-OHP5 S B 11b-OHA4 11-KDHT
## 24145190413 47616.43 1013.1342 5023.952 754.5581 8580.997 13735.454 100
## 2459090909 44356.73 935.1387 1111.426 569.7927 6167.239 4709.126 100
## 11-KA4 DHEA T/Epi-T 17aOH-P4 AN DHT A4
## 24145190413 1833.115 12456.910 912.9862 1095.8051 250.0000 50.0000 3065.905
## 2459090909 1004.484 5890.108 10091.7241 864.5884 574.9827 471.7821 1683.676
## DOC P5 P4 A E2
## 24145190413 58.15483 2375.196 270.70746 516.4204 864.3368
## 2459090909 53.65145 1583.961 85.07423 752.5202 60.0817
# The below ordering needs to be changed...
bileAcidsLiverData=bileAcidsLiverData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#bileAcidsLiverData[1:3,2:10] #https://stackoverflow.com/questions/54264980/r-how-to-set-row-names-attribute-as-numeric-from-character
# I did otherway around
serumBileAcidsData=serumBileAcidsData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#serumBileAcidsData[1:3,2:10]
clinicalData=clinicalData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#clinicalData[1:3,2:10]
# Many of these variables are irrelevant/not-used here... so not opening e.g. uniqueBMIValues, they would exhaust this file
C4Data=C4Data[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#C4Data[1:3,]
PFASSerumData=PFASSerumData[as.character(MetabolicAssociatedLiverDisease$PatientNumber),];#PFASSerumData[1:3,2:10]
# Menopause markers:
menopause=read_excel("Putative_metabolic_markers_menopause.xlsx",sheet='menopause markers',.name_repair = "minimal"); # rownames(clinicalData)=as.vector(unlist(clinicalData[,1]));
menopause=menopause[8:dim(menopause)[1],]; menopause=menopause[,-15]; menopause[2,2:14]=menopause[1,2:14]; menopause=data.frame(menopause); menopause[2,13:14]=c('v1','v2'); #dim(menopause)
colnames(menopause)=c('row_names',menopause[2,2:dim(menopause)[2]]); menopause=menopause[3:dim(menopause)[1],];rownames(menopause)=as.vector(unlist(menopause[,1]));
menopause=menopause[as.character(MetabolicAssociatedLiverDisease$PatientNumber),]
colnames(PFASSerumData)[colnames(PFASSerumData)=='PFHxA.1']='PFHxA_Branched'
PFASSerumData=PFASSerumData[,colnames(PFASSerumData)!='Benzylparaben.1']
PFASSerumData[PFASSerumData[,'Benzylparaben']>10,'Benzylparaben']=NA
Jeihou=data.frame(read_excel("Copy of BA_liverfat_RawData.xls",.name_repair = "minimal")); row.names(Jeihou)=Jeihou[,1];Jeihou=Jeihou[as.character(MetabolicAssociatedLiverDisease$PatientNumber),]
u=Jeihou[Jeihou[,'GHDGA']=='<LLOQ',1]; a=u[!is.na(u)]; b=rownames(bileAcidsLiverData[bileAcidsLiverData[,'GHDGA']==1,]);
uu=Jeihou[Jeihou[,'GHDGA']=='No Result',1]; aa=uu[!is.na(uu)];
bileAcidsLiverData[as.character(a),'GHDGA']=min(bileAcidsLiverData[,'GHDGA'],na.rm=TRUE)/2
AuxBileAcid=bileAcidsLiverData[bileAcidsLiverData[,'GHDGA']==1,1]
bileAcidsLiverData[as.character(AuxBileAcid),'GHDGA']=NA
#https://www.datasciencemadesimple.com/get-minimum-value-of-a-column-in-r-2/?expand_article=1
mat=bileAcidsLiverData[,c('TbMCA','ToMCA','TDCA','TDHCA','TLCA')]
mat[!mat>1]=10000
mat[mat==2]=10000 #Colmins did not work so I used (i.e. colmins did not work so I used):
hip=do.call(pmin, lapply(1:nrow(mat), function(i)mat[i,])) #https://stackoverflow.com/questions/13676878/fastest-way-to-get-min-from-every-column-in-a-matrix
hou=c('TbMCA','ToMCA','TDCA','TDHCA','TLCA')
for (i in 1:5) {bileAcidsLiverData[bileAcidsLiverData[,hou[i]]==1,hou[i]]=hip[i]}
for (i in 1:5) {bileAcidsLiverData[bileAcidsLiverData[,hou[i]]==2,hou[i]]=hip[i]}
# An imputation for the missing values:
C4Data[is.na(C4Data[,2]),2]=median(C4Data[!is.na(C4Data[,2]),2]) #assuming that these were not below quantitation and replacing with median
#https://www.geeksforgeeks.org/performing-logarithmic-computations-in-r-programming-log-log10-log1p-and-log2-functions/
#https://stackoverflow.com/questions/50476717/i-want-to-align-match-two-unequal-columns
#Matching two unequal columns.. match the names of one original column (dat2) to ones that are missing (dat1 with to other) #Not sure if this should be this difficult...
CombinedData=cbind(MetabolicAssociatedLiverDisease[,1],NonAlcoholicFattyLiverDisease[,2:7],clinicalData[,'HOMA.IR'],
MetabolicAssociatedLiverDisease[,colnames(NonAlcoholicFattyLiverDisease[,8:27])],
bileAcidsLiverData[,2:dim(bileAcidsLiverData)[2]],C4Data[,2:dim(C4Data)[2]],
serumBileAcidsData[,2:dim(serumBileAcidsData)[2]],
PFASSerumData[,(2:(dim(PFASSerumData)[2]))], MetabolicAssociatedLiverDisease[,'PFAS']);
colnames(CombinedData)[colnames(CombinedData)=='C4Data[, 2:dim(C4Data)[2]]']='C4Data';colnames(CombinedData)[colnames(CombinedData)=='clinicalData[, \"HOMA.IR\"]']='HOMA-IR'
colnames(CombinedData)[colnames(CombinedData)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
colnames(CombinedData)[colnames(CombinedData)=="MetabolicAssociatedLiverDisease[, 1]" ]='PatientNumber';#colnames(CombinedData)#
rownames(CombinedData)=unlist(bileAcidsLiverData[,1]);
RelevantColumns=colnames(CombinedData)[!colnames(CombinedData) %in% c( "Benzylparaben" ,"Methylparaben")]
# Not sure when it is the best time to take not needed variables away, perhaps at the very end?
CombinedData=CombinedData[,RelevantColumns]
# Here I add the lipids. In the future, I need to divide all the steroidGroups in their own components
# e.g. dataframe called 'lipids' so that adding uniqueBMIValues will be more straightforward:
CombinedData=cbind(CombinedData,MetabolicAssociatedLiverDisease[,(dim(MetabolicAssociatedLiverDisease)[2]-13):dim(MetabolicAssociatedLiverDisease)[2]])
# hupo=match( colnames(CombinedData)[colnames(CombinedData) %in% groups2[,3]], groups2[,3] ) # do ni; https://www.geeksforgeeks.org/how-to-find-index-of-element-in-vector-in-r/
# tvauxe=CombinedData
# colnames(CombinedData)[colnames(CombinedData) %in% groups2[,3]]=groups2[hupo,2]
# CombinedData_v2
#
# vec1=colnames(serumBileAcidsData)
# vec2=colnames(bileAcidsLiverData)
#
# common_elements <- intersect(vec1, vec2)
colnames(CombinedData)[colnames(CombinedData)=='C4Data']='C4'
#.1:set on seerumia
# The basic preprocessing is just the below lines:
tve=CombinedData[,2:dim(CombinedData)[2]]; tve[tve == 0] <- NA; #Almost all variables are here
HalfImputedData <- tve %>% mutate(replace(., is.na(.), min(., na.rm = T)/2)) #https://mdatools.com/docs/preprocessing--autoscaling.html
Log2TransformedData <- log2(HalfImputedData);
AutoScaledData <- prep.autoscale(as.matrix(Log2TransformedData), center = TRUE, scale = TRUE); #https://svkucheryavski.gitbooks.io/mdatools/content/preprocessing/text.html
AllData=cbind(CombinedData[,1],AutoScaledData);
# Changing the column names needs to have separate variables for each type of variable (contaminant, steroid, etc.)
x1=colnames(AllData[,c(1:8)]); v2=dim(NonAlcoholicFattyLiverDisease)[2]+1
x2=colnames(AllData[,9:v2]);v3=(dim(bileAcidsLiverData)[2]+v2);x3=colnames(AllData[,(v2+1):(v3)]);v4=(dim(serumBileAcidsData)[2])+v3
x4=colnames(AllData[,(v3+1):(v4-1)]);x5=colnames(AllData[,(v4):(dim(AllData)[2])]);
x3 <- paste(x3, "_L", sep="") #https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
x4=gsub("(-[0-9]*)*.1", "", x4) #https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub
x4 <- paste(x4, "_S", sep="")# https://rdrr.io/bioc/qpgraph/man/qpNrr.html
x5a=x5[1:9]
x6=x5[10:length(x5)] #Dividing to lipids
x5=x5a #Making sure that PFAS are separate
nm = c(x1,x2,x3,x4,x5,x6); nm=c('PatientNumber','Gender','AGE','BMI','Steatosis Grade','Fibrosis Stage','Necroinflammation','HOMA-IR',nm[9:length(nm)])
colnames(AllData)=nm; #AllData[1:5,1:30]; #NonAlcoholicFattyLiverDisease[1:2,1:28];
colnames(AllData)[colnames(AllData)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
# This (deletion) is good to do after all the previous:
x5=x5[x5!='PFAS'];x5=x5[x5!='Perfluorodecyl.ethanoic.acid']; x6=x6[x6!='Total_TG'] # x1;x2;x3;x4;x5;
AllData=AllData[,!colnames(AllData) %in% c('Total_TG','PFAS',"Perfluorodecyl.ethanoic.acid")]
# In case you would need just the logged values:
tv_half_log22=cbind(CombinedData[,1],Log2TransformedData);
x1=colnames(tv_half_log22[,c(1:8)]); v2=dim(NonAlcoholicFattyLiverDisease)[2]+1
x2=colnames(tv_half_log22[,9:v2]);v3=(dim(bileAcidsLiverData)[2]+v2);
x3=colnames(tv_half_log22[,(v2+1):(v3)]);v4=(dim(serumBileAcidsData)[2])+v3
x3=x3[c(length(x3),1:(length(x3)-1))]
x4=colnames(tv_half_log22[,(v3+1):(v4-1)]);
x5=colnames(tv_half_log22[,(v4):(dim(tv_half_log22)[2])]);
x3 <- paste(x3, "_L", sep="")
#https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
x4=gsub("(-[0-9]*)*.1", "", x4) #https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub
x4 <- paste(x4, "_S", sep="")# https://rdrr.io/bioc/qpgraph/man/qpNrr.html
x5a=x5[1:9]
x6=x5[10:length(x5)] #dividing to lipids
x5=x5a #making sure that PFAS are separate
nm = c(x1,x2,x3,x4,x5,x6); nm=c('PatientNumber','Gender','AGE','BMI','Steatosis Grade','Fibrosis Stage','Necroinflammation','HOMA-IR',nm[9:length(nm)])
colnames(tv_half_log22)=nm; #tv_half_log22[1:5,1:30]; #NonAlcoholicFattyLiverDisease[1:2,1:28];
colnames(tv_half_log22)[colnames(tv_half_log22)=='MetabolicAssociatedLiverDisease[, \"PFAS\"]']='PFAS';
# This (deletion) is good to do after all the previous:
x5=x5[x5!='PFAS'];x5=x5[x5!='Perfluorodecyl.ethanoic.acid']; x6=x6[x6!='Total_TG'] # x1;x2;x3;x4;x5;
tv_half_log22=tv_half_log22[,!colnames(tv_half_log22) %in% c('Total_TG','PFAS',"Perfluorodecyl.ethanoic.acid")]
# This needs to be done early on:
colnames(CombinedData)[colnames(CombinedData)=='17aOH-P4']='17a-OHP4'
colnames(tv_half_log22)[colnames(tv_half_log22)=='17aOH-P4']='17a-OHP4'
colnames(AllData)[colnames(AllData)=='17aOH-P4']='17a-OHP4'
AllData=AllData[,!colnames(AllData) %in% c('Total_TG','PFAS','Perfluorodecyl.ethanoic.acid')]
AllData=AllData[,!colnames(AllData) %in% x4]
# In case you would need nonscaled covariates and scaled/logged all other variables:
CovariatesScaledData=AllData
CovariatesNonScaledData=cbind(CombinedData[,1:8],AllData[,9:dim(AllData)[2]])
LogCovariatesScaledData=tv_half_log22
LogCovariatesNonScaledData=cbind(CombinedData[,1:8],tv_half_log22[,9:dim(tv_half_log22)[2]])
colnames(CovariatesNonScaledData)[1:8]=colnames(AllData)[1:8]
colnames(LogCovariatesNonScaledData)[1:8]=colnames(AllData)[1:8]
# This is needed occasionally:
CurrentData=CovariatesScaledData
# https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
# https://stackoverflow.com/questions/18997297/remove-ending-of-string-with-gsub # https://rdrr.io/bioc/qpgraph/man/qpNrr.html
hupo=match( colnames(CurrentData)[colnames(CurrentData) %in% groups2[,3]], groups2[,3] )
# do ni; https://www.geeksforgeeks.org/how-to-find-index-of-element-in-vector-in-r/
tvauxe=CurrentData
colnames(CurrentData)[colnames(CurrentData) %in% groups2[,3]]=groups2[hupo,2]
# This needs to be done also soon, to gather all the treatment etc. variable names separately...:
TreatmentVariables=colnames(AllData)[52:58];
MediatorVariables=colnames(AllData)[9:28];
OutcomeVariables=colnames(AllData)[c(29:51,59:71)]; ##https://sparkbyexamples.com/r-programming/r-remove-from-vector-with-examples/
OutcomeVariables=OutcomeVariables[!OutcomeVariables %in% c('Total_TG','PFAS','Perfluorodecyl.ethanoic.acid')]
OutcomeVariables=OutcomeVariables[! OutcomeVariables %in% x4] #https://sparkbyexamples.com/r-programming/r-remove-from-vector-with-examples/
MediatorVariables[MediatorVariables=="17aOH-P4"]="17a-OHP4"
TreatmentVariables=TreatmentVariables[!TreatmentVariables %in% c('Perfluorodecyl.ethanoic.acid')]
colnames(CovariatesScaledData)[colnames(CovariatesScaledData)=='C4Data_L']='C4_L'
colnames(CurrentData)[colnames(CurrentData)=='C4Data_L']='C4_L'
colnames(tvauxe)[colnames(tvauxe)=='C4Data_L']='C4_L'
colnames(AllData)[colnames(AllData)=='C4Data_L']='C4_L'
colnames(NonAlcoholicFattyLiverDisease)[colnames(NonAlcoholicFattyLiverDisease)=='C4Data_L']='C4_L'
x3[x3=='C4Data_L']='C4_L'
# Let's introduce some randomization. First, let us define the range of the random numbers:
# min_val <- 0.83; max_val <- 1.17
#
# # Then a function for the randomization:
# multiply_with_random <- function(df, cols, min_val, max_val) {
# for (col in cols) {
# if (col %in% names(df)) {
# df[[col]] <- df[[col]] * runif(nrow(df), min = min_val, max = max_val)
# } else {
# warning(paste("Column", col, "does not exist in the data frame"))}}
# return(df)}
#
# CovariatesScaledData=multiply_with_random(CovariatesScaledData, 3:(dim(CovariatesScaledData)[2]), min_val, max_val)
# CurrentData=multiply_with_random(CurrentData, 3:(dim(CurrentData)[2]), min_val, max_val)
# tvauxe=multiply_with_random(tvauxe, 3:(dim(tvauxe)[2]), min_val, max_val)
# AllData=multiply_with_random(AllData, 3:(dim(AllData)[2]), min_val, max_val)
# Just in case:
# save.image('thereal_v2.RData')
# setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
# Got some help for commenting and variable namings from Copilot to above code.
# Also multiply_with_random was done together with Copilot.
Setting Global Variables
options(scipen = 999) # Disable scientific notation
# rm(list = ls()) # Clear workspace; this should not be if you have the load above
thedate <- strftime(Sys.Date(), "%d%m%y") #Do not take the old date from the load...
date <- paste0('tikka', thedate) # Customize this as needed
knitr::opts_knit$set(root.dir = 'C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis') #Is this working? Yes, but in knitr:https://forum.posit.co/t/setting-working-directory-in-rmarkdown/70849
# Example installation commands
# remotes::install_github("fossbert/binilib", force=TRUE)
# install.packages(c('tidyverse', 'tibble'))
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("Maaslin2")
# devtools::install_github("davidsjoberg/ggforestplot")
# remotes::install_version("insight", version = "0.20.5", repos = "http://cran.us.r-project.org", force=TRUE)
# font_import() # Import fonts if not already done
# loadfonts(device = "win") # Load fonts for Windows
# renv::status() # Check renv status
# library(rmarkdown); render("path/to/file.Rmd") # Render R Markdown document
# remove.packages("DelayedArray")
# BiocManager::install("DelayedArray")
# install.packages("Require") # Install 'Require' package
# If the installation to the project file is problematic, do it to the general folder and add it to the .libPaths(), as above library import.
Making Boxplots
#https://r-graph-gallery.com/265-grouped-boxplot-with-ggplot2.html
#https://stackoverflow.com/questions/53724834/why-does-the-plot-size-differ-between-docx-and-html-in-rmarkdownrender
windowsFonts(Arial = windowsFont("Arial"))
CreateBoxplots <- function(tvt, Group, OutcomeVariables, Out, oute, other) {
# Filter data based on gender
tvt <- tvt %>%
filter(if (Group == 'Male') Gender == 2 else if (Group == 'Female') Gender == 1 else TRUE)
# Prepare data for plotting
Steroid <- rep(colnames(tvt[, 9:28]), each = nrow(tvt))
data2 <- rep('Control', nrow(tvt))
num <- ifelse(OutcomeVariables == 'HOMA-IR', 1.5, min(tvt[[OutcomeVariables]]))
data2[tvt[[OutcomeVariables]] > num] <- 'Case'
TreatmentVariables <- data2
Concentration <- as.vector(unlist(tvt[, 9:28]))
data <- data.frame(Steroid, TreatmentVariables, Concentration)
data$Group <- 0
# Correct steroid names if the level exists
if ("17aOH-P4" %in% levels(data$Steroid)) {
data <- data %>%
mutate(Steroid = fct_recode(Steroid, '17a-OHP4' = '17aOH-P4'))
}
groups3[groups3[,1]=='Estrogens',1] ='Estrog.' #''
# Assign steroidGroups
rownames(groups3) <- 1:20
for (i in seq_len(nrow(groups3))) {
data[data$Steroid %in% groups3$Abbreviation[i], 'Group'] <- groups3$Group[i]
}
# Set plot title
title <- paste(Out, "'s Effect on Concentrations of Steroids in ", Group, sep = "")
# Define legend labels
e1 <- ifelse(num == 1.5, paste('Case (>', num, ')', sep = ""), paste('Case (>', 0, ')', sep = ""))
e2 <- ifelse(num == 1.5, paste('Control (<=', num, ')', sep = ""), paste('Control (=', 0, ')', sep = ""))
# Remove rows with NA concentrations
data <- data %>% filter(!is.na(Concentration))
# Create a named vector for mapping (this and the below line was as per Copilot)
abbreviation_map <- setNames(groups3$Abbreviation_old, groups3$Abbreviation)
# Map the Steroid values to Abbreviation_old values
data$Abbreviation_old <- abbreviation_map[data$Steroid]
colnames(data)[1]='BigName'
colnames(data)[5]='Steroid'
data=data[,c(5,2,3,4)]
# Create boxplot
# Create boxplot
# Create boxplot
# p <- ggplot(data, aes(x = Steroid, y = Concentration, fill = TreatmentVariables)) +
# geom_boxplot(notch = FALSE, notchwidth = 0.5, outlier.shape = 1, outlier.size = 2, coef = 1.5, colour = "") + #grey50
# theme_classic2() +
# theme(
# axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2, size = 16),
# axis.text = element_text(color = "black"),
# panel.grid.minor = element_blank(),
# text = element_text(size = 16, family = "Times New Roman"),
# axis.title = element_text(size = 16),
# plot.title = element_text(size = 16),
# legend.text = element_text(size = 16),
# legend.title = element_text(size = 16)
# ) +
# labs(x = "Steroids", y = "Log2 of Picomolar Concentrations", title = title) +
# scale_fill_manual(values = c("orange", "blue"), name = oute, labels = c(e1, e2)) +
# facet_grid(~Group, scales = "free_x", space = "free") +
# stat_compare_means(
# hide.ns = TRUE, label = "p.signif", method = "wilcox.test",
# symnum.args = list(
# cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
# symbols = c("****", "***", "**", "*", "ns")
# ),
# size = 8, paired = FALSE, label.y = 15.5
# )
library(ggplot2)
library(ggpubr) # for stat_compare_means
library(ggthemes) # for theme_classic2
library(extrafont) # if needed for Times New Roman
# p <- ggplot(data, aes(x = Steroid, y = Concentration, fill = TreatmentVariables)) +
# geom_boxplot(
# notch = FALSE, notchwidth = 0.5,
# outlier.shape = 1, outlier.size = 2,
# coef = 1.5
# # colour = ""
# ) +
# theme_classic2(base_family = "Times New Roman") +
# theme(
# axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2, size = 16, family = "Times New Roman"),
# axis.text = element_text(color = "black", family = "Times New Roman"),
# axis.title.x = element_text(size = 16, face = "bold", family = "Times New Roman"),
# axis.title.y = element_text(size = 16, face = "bold", family = "Times New Roman"),
# plot.title = element_text(size = 16, face = "bold", family = "Times New Roman"),
# legend.text = element_text(size = 16, family = "Times New Roman"),
# legend.title = element_text(size = 16, face = "bold", family = "Times New Roman"),
# strip.text = element_text(size = 16, face = "bold", family = "Times New Roman"),
# strip.background = element_rect(fill = "lightblue", color = "black"), # customize color here
# panel.grid.minor = element_blank()
# ) +
# labs(
# x = "Steroids",
# y = "Log2 of Picomolar Concentrations",
# title = title
# ) +
# scale_fill_manual(
# values = c("orange", "blue"),
# name = oute,
# labels = c(e1, e2)
# ) +
# facet_grid(~Group, scales = "free_x", space = "free") +
# stat_compare_means(
# hide.ns = TRUE, label = "p.signif", method = "wilcox.test",
# symnum.args = list(
# cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
# symbols = c("****", "***", "**", "*", "ns")
# ),
# size = 8, paired = FALSE, label.y = 15.5
# )
p <- ggplot(data, aes(x = Steroid, y = Concentration, fill = TreatmentVariables)) +
geom_boxplot(
notch = FALSE,
outlier.shape = 1,
outlier.size = 2,
coef = 1.5
) +
theme_classic2(base_family = "Arial") +
theme(
axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2, size = 16),
axis.text.y = element_text(size = 16),
axis.title = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 16, face = "bold"),
legend.text = element_text(size = 16),
legend.title = element_text(size = 16, face = "bold"),
strip.text = element_text(size = 16, face = "bold"),
strip.background = element_rect(fill = "lightblue", color = "black")
) +
labs(
x = "Steroids",
y = "Log2 of Picomolar Concentrations",
title = title
) +
scale_fill_manual(
values = c("orange", "blue"),
name = oute,
labels = c(e1, e2)
) +
facet_grid(~Group, scales = "free_x", space = "free") +
stat_compare_means(
hide.ns = TRUE,
label = "p.signif",
method = "wilcox.test",symnum.args = list(
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("****", "***", "**", "*", "ns")
),
size = 8,paired = FALSE, label.y = 15.3,
)
# Define output path
path <- "C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis/"
fs::dir_create(path) # Ensure directory exists
# === Extract statistics table (same tests used in stat_compare_means) ===
stats_tbl <- ggpubr::compare_means(
formula = Concentration ~ TreatmentVariables,
group.by = "Steroid",
data = data,
method = "wilcox.test"
)
# Define statistics output path
stats_file <- fs::path(path, paste0(Group, "_", Out, "_stats.csv"))
# Save stats table
write.csv(stats_tbl, stats_file, row.names = FALSE)
message("✅ Stats table saved: ", stats_file)
# === WILCOXON TEST + SAVE CSV ===
library(rstatix)
stats_tbl <- data %>%
group_by(Steroid) %>%
wilcox_test(Concentration ~ TreatmentVariables, paired = FALSE) %>%
adjust_pvalue(method = "BH") %>% # FDR correction
add_significance("p.adj") # significance stars
stats_file <- fs::path(path, paste0(Group, "_", Out, "_wilcox_stats.csv"))
write.csv(stats_tbl, stats_file, row.names = FALSE)
message("✅ Wilcoxon stats saved: ", stats_file)
# Save as PNG
pngfile <- fs::path(path, paste0(Group, Out, "BoxB", ".png"))
try({
ragg::agg_png(filename = pngfile, width = 60, height = 36, units = "cm", res = 300, scaling = 2)
print(p)
dev.off()
if (file.exists(pngfile)) {
message("✅ PNG saved successfully: ", pngfile)
} else {
warning("❌ PNG file not found after saving.")
}
}, silent = FALSE)
# Save as JPEG
jpgfile <- fs::path(path, paste0(Group, Out, "BoxB", ".jpg"))
try({
if (requireNamespace("ragg", quietly = TRUE) && "agg_jpeg" %in% getNamespaceExports("ragg")) {
ragg::agg_jpeg(filename = jpgfile, width = 60, height = 36, units = "cm", res = 300, scaling = 2, quality = 95)
} else {
grDevices::jpeg(filename = jpgfile, width = 60, height = 36, units = "cm", res = 300, quality = 95)
}
print(p)
dev.off()
if (file.exists(jpgfile)) {
message("✅ JPEG saved successfully: ", jpgfile)
} else {
warning("❌ JPEG file not found after saving.")
}
}, silent = FALSE)
# ===== SAVE REAL SVG (VECTOR) =====
svgfile <- fs::path(path, paste0(Group, Out, "BoxB", ".svg"))
svglite::svglite(
file = svgfile,
width = 60 / 2.54, # cm → inches
height = 36 / 2.54
)
print(p)
dev.off()
if (file.exists(svgfile)) {
message("✅ SVG saved successfully: ", svgfile)
} else {
warning("❌ SVG file not found after saving.")
}
# Convert PNG to PDF
# pdffile <- fs::path(path, paste0(Group, Out, "BoxB", ".pdf"))
# try({
# daiR::image_to_pdf(files = as.character(pngfile), pdf_name = as.character(pdffile))
# if (file.exists(pdffile)) {
# message("✅ PDF created successfully: ", pdffile)
# } else {
# warning("❌ PDF file not found after conversion.")
# }
# }, silent = FALSE)
#
# # Optionally include PNG in document (e.g., RMarkdown)
# if (file.exists(pngfile)) {
# knitr::include_graphics(pngfile)
# }
# ==== REAL PDF (VECTOR) ====
pdffile <- fs::path(path, paste0(Group, Out, "BoxB.pdf"))
cairo_pdf(pdffile, width = 60/2.54, height = 36/2.54)
print(p)
dev.off()
}
# Example usage
LogCovariatesNonScaledData[, '11-KA4'][LogCovariatesNonScaledData[, '11-KA4'] ==
min(LogCovariatesNonScaledData[, '11-KA4'])] <- median(LogCovariatesNonScaledData[, '11-KA4'])
# other <- '261124'
ie <- LogCovariatesNonScaledData#
# This is partly auxiliary
# setwd("C:/Users/patati/Desktop/Turku/R")
# or:"C:/Users/patati/Desktop/Turku/R"
# groups2=data.frame(steroidNames)[,1:4]; g1=read.csv("C:/Users/patati/Desktop/Turku/R/groups_17823.csv", header = TRUE, sep=";")
# groups2=cbind(g1[,'Group'], groups2[,c('Abbreviation','Abbreviation_old','Name')])
# groups2=groups2[groups2[,'Abbreviation']!='FF',]; colnames(groups2)[1]="Group";
# groups2=groups2[order(groups2[,'Group']),]
# groups2[,'Group'][groups2[,'Abbreviation_old']=='DOC']='Mineralocorticoids'
#
# groups2[,'Abbreviation'][groups2[,'Abbreviation']=='17aOH-P4']='17a-OHP4'
# #The Further Abbreviations:
groups3=groups2
groups3[groups3[,1]=='Glucocorticoids',1] ='Glucoc.' #'Glucoc.' G.C.'
groups3[groups3[,1]=='Mineralocorticoids',1] ='Mineralc.' #'Miner.c.' M.C.
#groups2[groups2[,1]=='Progestogens',1] ='P.G.' #''
groups2[groups2[,1]=='Estrogens',1] ='Estrog.' #''
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
hupo <- match(colnames(ie)[colnames(ie) %in% groups2[, 3]], groups2[, 3])
colnames(ie)[colnames(ie) %in% groups2[, 3]] <- groups2[hupo, 2]
windowsFonts(A = windowsFont("Calibri (Body)"))
knitr::opts_knit$set(root.dir = 'C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis')
# The significance levels are: '****<0.001', '***<0.01', '**<0.05', '*<0.1'
OutcomeVariables='Steatosis Grade';Out='Steatosis'; oute='Steatosis Grade';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);
## png
## 2
Group='Female';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
## png
## 2
OutcomeVariables='Fibrosis Stage';Out='Fibrosis'; oute='Fibrosis Stage';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
## png
## 2
# https://www.elsevier.es/en-revista-annals-hepatology-16-articulo-assessment-hepatic-fibrosis-necroinflammation-among-S1665268119314590 #So it is in grade
OutcomeVariables='Necroinflammation';Out='Necroinflammation'; oute='Necroinflammation Grade';num=0;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
## png
## 2
OutcomeVariables='HOMA-IR';Out='HOMA-IR'; oute='HOMA-IR';num=1.5;Group='All';
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Female';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other);Group='Male';
## png
## 2
CreateBoxplots(ie,Group,OutcomeVariables,Out,oute,other)
## png
## 2
Making Forest Plots
# Define the NonAlcoholicFattyLiverDisease dataset by selecting the first 28 columns from CombinedData
NonAlcoholicFattyLiverDisease <- CombinedData[, 1:28]
# Convert specific columns to binary values using vectorized operations
cols_to_binary <- c(5, 6, 7)
NonAlcoholicFattyLiverDisease[, cols_to_binary] <- (NonAlcoholicFattyLiverDisease[, cols_to_binary] > 0) * 1
# Convert column 8 to binary based on the threshold of 1.5
NonAlcoholicFattyLiverDisease[, 8] <- (NonAlcoholicFattyLiverDisease[, 8] > 1.5) * 1
# Clean column names to remove special characters and make uniqueBMIValues consistent
patterns <- c("-", "/", "11", "17", "#")
replacements <- c(".", ".", "X11", "X17", ".") #?
# Ensure patterns and replacements are correctly paired
if (length(patterns) == length(replacements)) {
for (i in seq_along(patterns)) {
colnames(NonAlcoholicFattyLiverDisease) <- gsub(patterns[i], replacements[i], colnames(NonAlcoholicFattyLiverDisease))}} else {
stop("Patterns and replacements vectors must be of the same length.")}
#' CalculateErrors
#' Builds ratio (case/control) with log CI, maps groups via groups2, creates forest plot, saves images.
#' @param NonAlcoholicFattyLiverDisease data.frame
#' @param OutcomeVariables character; binary outcome column in NAFLD data (0 vs >0)
#' @param Group character; "Male" | "Female" | "All"
#' @param name character; output file stem (e.g., "MyPlot")
#' @param ordera character vector of abbreviations to order rows; if NULL, inferred from groups2
#' @param oute (kept for compatibility; unused in this function)
#' @param first logical; when Group=="All" and first==TRUE, reverse order
#' @param e character; reference analyte used for ratio construction (kept from your code)
#' @param xlim numeric length-2 or NULL; x axis limits (log scale). If NULL, inferred from data
#' @param groups2 data.frame with columns: Group, Abbreviation, Abbreviation_old, Name
#' @return ordera vector used for the y order
CalculateErrors <- function(NonAlcoholicFattyLiverDisease,
OutcomeVariables,
Group,
name,
ordera = NULL,
oute = NULL,
first = FALSE,
e,
xlim = NULL,
groups2) {
## ---- Safety checks ----
stopifnot(is.data.frame(NonAlcoholicFattyLiverDisease))
if (!OutcomeVariables %in% colnames(NonAlcoholicFattyLiverDisease)) {
stop("OutcomeVariables='", OutcomeVariables, "' not found in data.")
}
if (!'SEX.1F.2M' %in% colnames(NonAlcoholicFattyLiverDisease)) {
stop("'SEX.1F.2M' column not found in data.")
}
needed_g2 <- c("Group", "Abbreviation", "Abbreviation_old", "Name")
if (!all(needed_g2 %in% colnames(groups2))) {
stop("groups2 must contain columns: ", paste(needed_g2, collapse = ", "))
}
## ---- Helper: name normalization used for matching ----
norm_key <- function(x) {
x <- toupper(as.character(x))
x <- gsub("^X(?=[0-9])", "", x, perl = TRUE) # drop leading X before digits
gsub("[^A-Z0-9]", "", x) # strip non-alphanumerics
}
## ---- 1) Filter by Group ----
NAFLDo <- switch(Group,
"Male" = NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[, 'SEX.1F.2M'] == 2, ],
"Female" = NonAlcoholicFattyLiverDisease[NonAlcoholicFattyLiverDisease[, 'SEX.1F.2M'] == 1, ],
"All" = NonAlcoholicFattyLiverDisease,
stop("Group must be one of 'Male', 'Female', 'All'."))
if (nrow(NAFLDo) == 0) stop("No rows after filtering by Group = '", Group, "'.")
## ---- 2) Find analyte (steroid) columns dynamically via groups2 ----
keys <- unique(na.omit(c(groups2$Abbreviation, groups2$Abbreviation_old, groups2$Name)))
keys_n <- norm_key(keys)
cn <- colnames(NAFLDo)
cn_n <- norm_key(cn)
match_idx <- which(cn_n %in% keys_n)
measure_cols <- cn[match_idx]
# Exclude non-measure columns
measure_cols <- setdiff(measure_cols, c(OutcomeVariables, "SEX.1F.2M"))
# Keep numeric
measure_cols <- measure_cols[sapply(measure_cols, function(v) is.numeric(NAFLDo[[v]]))]
if (length(measure_cols) == 0) {
stop(
"Could not find any analyte columns by matching groups2 to your data.\n",
"Check that your data column names correspond to groups2 Abbreviation/Abbreviation_old/Name."
)
}
## ---- 3) Split by outcome and summarize ----
sample_data <- vector("list", 2)
n0 <- n1 <- 0
for (i in 1:2) {
SG0 <- if (i == 1) {
NAFLDo[NAFLDo[[OutcomeVariables]] == 0, , drop = FALSE]
} else {
NAFLDo[NAFLDo[[OutcomeVariables]] > 0, , drop = FALSE]
}
if (i == 1) n0 <- nrow(SG0) else n1 <- nrow(SG0)
if (nrow(SG0) == 0) stop("Outcome split ", i, " has zero rows. Check '", OutcomeVariables, "' coding.")
means <- sapply(measure_cols, function(v) median(SG0[[v]], na.rm = TRUE))
sds <- sapply(measure_cols, function(v) sd(SG0[[v]], na.rm = TRUE))
sample_data[[i]] <- data.frame(
study = measure_cols,
index = measure_cols,
result = as.numeric(means),
error = as.numeric(sds),
stringsAsFactors = FALSE
)
}
# Merge (control arm i=1 with case arm i=2) by analyte name
df <- merge(sample_data[[1]], sample_data[[2]], by = c("study", "index"), suffixes = c("", ".1"))
## ---- 4) Wilcoxon p-values per analyte (vector interface, avoids formula pitfalls) ----
ov <- NAFLDo[[OutcomeVariables]]
ctrl_rows <- ov == 0
case_rows <- ov > 0
ps <- sapply(measure_cols, function(v) {
x <- NAFLDo[ctrl_rows, v]
y <- NAFLDo[case_rows, v]
x <- x[is.finite(x)]
y <- y[is.finite(y)]
if (length(x) < 1 || length(y) < 1) return(NA_real_)
tryCatch(wilcox.test(x, y, exact = FALSE)$p.value, error = function(e) NA_real_)
})
names(ps) <- measure_cols
## ---- 5) Ratio/log values & cleanup ----
# Keep your 'e' reference ratio (not used downstream, but retained)
suppressWarnings({
a <- tryCatch(
df[df[, "study"] == e, 'result.1'] / df[df[, "study"] == e, 'result'],
error = function(e) NA_real_
)
})
ResComb <- data.frame(log(df$result.1 / df$result))
ResComb$result <- ResComb[, 1]
ResComb$name_raw <- df$study
ResComb <- ResComb[, c("result", "name_raw")]
# Pretty names for plotting (match your prior cleanup)
ResComb$name <- ResComb$name_raw
ResComb$name <- gsub("\\.", "-", ResComb$name)
ResComb$name <- sub("^X11", "11", ResComb$name)
ResComb$name <- sub("^X17", "17", ResComb$name)
ResComb$name[ResComb$name == "T-Epi-T"] <- "T/Epi-T"
# Attach p‑values by original raw colname
ResComb$pval <- ps[match(ResComb$name_raw, names(ps))]
## ---- 6) Error / CI on ratio (as in your code) ----
ResComb$result_pure <- df$result.1 / df$result
ResComb$error <- (abs((1 / df$result) * df$error.1) +
abs((df$result.1 / (df$result^2)) * df$error)) / nrow(NAFLDo) * 1.64
# Cap extremes
medE <- median(ResComb$error, na.rm = TRUE)
sdE <- sd(ResComb$error, na.rm = TRUE)
ResComb$error <- ifelse(ResComb$error > (medE + sdE), medE * 1.25, ResComb$error)
# Bounds + log transform (guard negative/zero to avoid -Inf)
ResComb$errord1a <- pmax(ResComb$result_pure - ResComb$error, .Machine$double.eps)
ResComb$errord2a <- pmax(ResComb$result_pure + ResComb$error, .Machine$double.eps)
ResComb$errord1 <- log(ResComb$errord1a)
ResComb$errord2 <- log(ResComb$errord2a)
ResComb$result <- log(ResComb$result_pure)
ResComb$Control <- df$result
ResComb$Case <- df$result.1
# P-values & flags
ResComb$pval0 <- ResComb$pval
ResComb$pval1 <- ResComb$pval
ResComb$Significance0 <- ifelse(ResComb$pval0 < 0.1, 'Yes', 'No')
ResComb$Color0 <- ifelse(ResComb$pval0 < 0.1, 'blue', 'grey')
ResComb$Significance1 <- ifelse(ResComb$pval1 < 0.1, 'Yes', 'No')
ResComb$Color1 <- ifelse(ResComb$pval1 < 0.1, 'blue', 'grey')
## ---- 7) Map Group via groups2 (match by Abbrev / old / Name) ----
key_tbl <- unique(do.call(rbind, list(
data.frame(key = as.character(groups2$Abbreviation), Group = groups2$Group, stringsAsFactors = FALSE),
data.frame(key = as.character(groups2$Abbreviation_old), Group = groups2$Group, stringsAsFactors = FALSE),
data.frame(key = as.character(groups2$Name), Group = groups2$Group, stringsAsFactors = FALSE)
)))
key_tbl <- key_tbl[!is.na(key_tbl$key) & nzchar(key_tbl$key), ]
key_tbl$key_norm <- norm_key(key_tbl$key)
ResComb$key_norm <- norm_key(ResComb$name)
map_df <- merge(ResComb[, c("name", "key_norm")],
unique(key_tbl[, c("key_norm", "Group")]),
by = "key_norm", all.x = TRUE)
ResComb$Group <- map_df$Group[match(ResComb$key_norm, map_df$key_norm)]
# Warn on unmapped
unmapped <- sort(unique(ResComb$name[is.na(ResComb$Group)]))
if (length(unmapped)) {
warning("Some analytes could not be mapped to a group in groups2: ",
paste(unmapped, collapse = ", "))
}
# Facet order: as in groups2
group_levels <- unique(groups2$Group)
ResComb$Group <- factor(ResComb$Group, levels = group_levels)
## ---- 8) Order y-axis levels (names) ----
if (is.null(ordera)) {
g2_abbr <- groups2$Abbreviation
ordera_n <- norm_key(g2_abbr)
present <- g2_abbr[ordera_n %in% unique(ResComb$key_norm)]
ordera <- present
if (identical(Group, "All") && isTRUE(first)) ordera <- rev(ordera)
} else {
if (identical(Group, "All") && isTRUE(first)) ordera <- rev(ordera)
}
# Convert ordera (abbreviations) to displayed levels present in ResComb$name
ordera_norm <- norm_key(ordera)
display_levels <- ResComb$name[match(ordera_norm, ResComb$key_norm)]
display_levels <- display_levels[!is.na(display_levels)]
if (length(display_levels) == 0) display_levels <- ResComb$name
ResComb$name <- factor(ResComb$name, levels = unique(display_levels))
# Numeric group index for strip fills
ResComb$Group2 <- as.numeric(as.factor(ResComb$Group))
## ---- 9) xlim ----
if (is.null(xlim) || length(xlim) != 2 || any(!is.finite(xlim))) {
xlim <- range(c(ResComb$errord1, ResComb$errord2), na.rm = TRUE)
}
## ---- 10) Build forest plot ----
plote2 <- forestplot(
df = ResComb,
estimate = result,
se = 0,
pvalue = pval1,
psignif = 0.1,
xlim = xlim,
xlab = 'Logged Ratio between Raw Concentrations of Case and Control with 90% CI',
ylab = 'Steroid Groups',
title = '',
colour = Significance1
) +
# ggforce::facet_col(facets = ~Group, scales = "free_y", space = "free",strip.position = 'left') + #space = "free",
# ggforce::facet_col(facets = ~Group, scales = "free_y", strip.position = 'left')+
# facet_grid(
# Group ~ .,
# scales = "free_y",
# space = "free_y")+ #strip.position = 'left'
geom_errorbarh(aes(xmin = errord1, xmax = errord2, height = .0, colour = Significance1))
# Point color palette
hp <- if (sum(ResComb$Significance1 == 'Yes', na.rm = TRUE) == length(levels(ResComb$name))) {
c('blue', 'blue')
} else {
c('#999999', 'blue')
}
# Try stripes transparency (guard in case layer indexing differs)
try({ plote2$layers[[1]]$aes_params$odd <- "#00000000" }, silent = TRUE)
PlotVar <- plote2 + theme(axis.text.y = element_blank()) + theme_classic2()
PlotVar2 <- PlotVar +
geom_point(aes(colour = factor(Significance1)), colour = ResComb$Color1) +
scale_color_manual(values = hp) +
theme(legend.position = "none") +
theme(strip.text.y = element_text(size = -Inf))
# Customize facet strip colors
# g <- ggplot_gtable(ggplot_build(PlotVar2))
# stripr <- which(grepl('strip-l', g$layout$name))
# fills <- c("red", "green", "blue", "yellow", "brown")
# if (length(stripr) > 0) {
# fills <- rep(fills, length.out = length(stripr))
# for (i in seq_along(stripr)) {
# j <- tryCatch(which(grepl('rect', g$grobs[[stripr[i]]]$grobs[[1]]$childrenOrder)),
# error = function(e) integer(0))
# if (length(j) > 0) {
# g$grobs[[stripr[i]]]$grobs[[1]]$children[[j]]$gp$fill <- fills[i]
# }
# }
# }
jpeg(paste0(name, "divi.jpg"), width = 7500, height = 11000, quality = 100, pointsize = 16, res = 1000)
print(PlotVar2)
dev.off()
## ---- 11) Save outputs ----
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
# jpeg(paste0(name, "divi.jpg"), width = 7500, height = 11000, quality = 100, pointsize = 16, res = 1000)
# print(grid::grid.draw(g))
# dev.off()
# daiR::image_to_pdf(paste0(name, "divi.jpg"), pdf_name = paste0(paste0(name, "divi.jpg"), '.pdf'))
# my_image <- magick::image_read(paste0(name, "divi.jpg"))
# my_svg <- magick::image_convert(my_image, format = "svg")
# magick::image_write(my_svg, paste0(name, "divi.svg"))
#
# library(ggplot2)
# PDF (true vector)
ggsave(
filename = paste0(name, "divi.pdf"),
plot = PlotVar2,
device = cairo_pdf, # ensures embedded fonts + vector
width = 7.5,
height = 11,
units = "in"
)
# SVG (true vector)
ggsave(
filename = paste0(name, "divi.svg"),
plot = PlotVar2,
device = "svg",
width = 7.5,
height = 11,
units = "in"
)
return(ordera)
}
# This is with first(!!). Use it.
OutcomeVariables='Steatosis.Grade.0.To.3';Out='Steatosis'; oute='Steatosis';first=TRUE; e='P4';ordera=c();
Group='All';name1=paste("Forest plot of",Group, "Steroid Ratios in",Out);
hel=CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name1,ordera,oute,first,e,xlim, groups2)
# #Afterwards:
first=FALSE;
Group='Female';name2=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name2,ordera,oute,first,e,xlim, groups2)
Group='Male'; name3=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name3,ordera,oute,first,e,xlim, groups2)
#
OutcomeVariables='Fibrosis.Stage.0.to.4'; Out='Fibrosis';oute='Fibrosis';
Group='All'; name4=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name4,ordera,oute,first,e,xlim, groups2)
Group='Female';name5=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name5,ordera,oute,first,e,xlim, groups2)
Group='Male'; name6=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name6,ordera=hel,oute,first,e,xlim, groups2)
#
OutcomeVariables='Necroinflammation'; Out='Necroinflammation';oute='Necroinflammation';
Group='All'; name7=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name7,ordera=hel,oute,first,e,xlim, groups2) #not the very first though...
Group='Female';name8=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name8,ordera=hel,oute,first,e,xlim, groups2)
Group='Male'; name9=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name9,ordera=hel,oute,first,e,xlim, groups2)
#
OutcomeVariables='HOMA.IR';Out='HOMA-IR';oute='HOMAIR';
Group='All';name10=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name10,ordera=hel,oute,first,e,xlim, groups2) #not the very first though...
Group='Female';name11=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name11,ordera=hel,oute,first,e,xlim, groups2)
Group='Male'; name12=paste("Forest plot of",Group, "Steroid Ratios in",Out);
CalculateErrors(NonAlcoholicFattyLiverDisease,OutcomeVariables,Group,name12,ordera=hel,oute,first,e,xlim, groups2)
##Alternative calculation and plotting style, you need to have the linear model results saved as excel: #'lms_tikka19324_v2.xlsx'
library(readxl)
library(ggplot2)
library(ggh4x) # per-facet y-limits, themed strips, facet_grid2(..., space='free_y')
library(grid)
library(magick)
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")
build_fixed_orders <- function(groups2, global_y_order = NULL) {
stopifnot(all(c("Group","Abbreviation_old") %in% colnames(groups2)))
fixed_group_levels <- unique(as.character(groups2$Group)) # facet row order
if (is.null(global_y_order)) {
global_y_order <- unique(as.character(groups2$Abbreviation_old))
} else {
global_y_order <- as.character(global_y_order)
}
group_to_abbr <- lapply(fixed_group_levels, function(gl) {
grp_set <- as.character(groups2$Abbreviation_old[groups2$Group == gl])
global_y_order[global_y_order %in% grp_set]
})
names(group_to_abbr) <- fixed_group_levels
list(
group_levels = fixed_group_levels,
group_to_abbr = group_to_abbr,
global_y_order = global_y_order
)
}
CalculateErrors <- function(
Group,
name,
groups2,
lm_path,
lm_sheet = "Steroids' adjusted lms",
lm_condition = NULL,
use_qval = FALSE,
alpha = 0.10,
se_multiplier = 0.5,
strict_gender = TRUE,
fixed_group_levels = NULL,
fixed_y_by_group = NULL,
global_y_order = NULL,
group_strip_fills = NULL,
shorten_long_group_names = TRUE,
strip_hjust = 0.50,
strip_vjust = 0.30,
strip_margin_left = 6,
strip_margin_right = 4,
axis_title_size = 14,
axis_text_size = 12,
strip_text_size = 12,
title_face = "plain",
text_face = "plain",
strip_face = "plain",
device_width_px = 8500,
device_height_px = NULL,
device_res = 800,
symmetric_x = TRUE,
x_breaks_n = 7,
x_nice_step = 0.05,
x_minor_width = 0.025,
x_expand = c(0.02, 0.02),
x_label_accuracy = 0.01,
x_title_top_margin = 12,
xlim = NULL,
output_dir = getwd(),
save_outputs = TRUE,
return_plot = FALSE,
...
)
{
# ================================================================
# Helpers
# ================================================================
# ✅ FIX FOR NULL HEIGHT
if (is.null(device_height_px)) {
device_height_px <- 8000
}
norm_key <- function(x) {
x <- toupper(as.character(x))
x <- gsub("^X(?=[0-9])","", x, perl = TRUE)
gsub("[^A-Z0-9]", "", x)
}
header_norm <- function(x) {
x <- tolower(as.character(x))
x <- gsub("[\r\n]+", " ", x); x <- gsub('"', "", x)
x <- gsub("\\s+", " ", x); x <- trimws(x)
gsub("[^a-z0-9]","", x)
}
parse_num <- function(x) {
if (is.numeric(x)) return(x)
x <- gsub("\\s+","", as.character(x))
x <- gsub(",", ".", x)
suppressWarnings(as.numeric(x))
}
infer_condition_from_name <- function(s) {
if (is.null(s)) return(NA_character_)
st <- tolower(s)
if (grepl("steatosis", st)) return("Steatosis")
if (grepl("fibrosis", st)) return("Fibrosis")
if (grepl("necro[- ]?inflam", st)) return("Necroinflammation")
if (grepl("homa", st)) return("HOMAIR")
NA_character_
}
normalize_cond <- function(s) gsub("[^a-z0-9]","", tolower(as.character(s)))
# ================================================================
# Safety checks
# ================================================================
stopifnot(is.data.frame(groups2))
req_cols <- c("Group","Abbreviation","Abbreviation_old","Name")
stopifnot(all(req_cols %in% colnames(groups2)))
if (!file.exists(lm_path)) stop("LM file not found: ", lm_path)
gender_token <- switch(tolower(Group),
"all"="both",
"male"="male",
"female"="female",
"both")
# ================================================================
# Read LM file
# ================================================================
lm_raw <- readxl::read_excel(lm_path, sheet = lm_sheet, .name_repair = "minimal")
if (nrow(lm_raw) == 0) stop("LM sheet empty.")
cn_norm <- header_norm(colnames(lm_raw))
idx_steroid <- which(cn_norm=="steroid")[1]
idx_gender <- which(cn_norm=="genderincondition")[1]
idx_cond <- which(cn_norm=="condition")[1]
idx_coef <- which(cn_norm=="coef")[1]
idx_se <- which(cn_norm=="stderr")[1]
idx_pval <- which(cn_norm=="pval")[1]
idx_qval <- which(cn_norm=="qval")[1]
LM <- data.frame(
Steroid = trimws(as.character(lm_raw[[idx_steroid]])),
GenderInCond = trimws(as.character(lm_raw[[idx_gender]])),
Condition = trimws(as.character(lm_raw[[idx_cond]])),
Coef = parse_num(lm_raw[[idx_coef]]),
Stderr = parse_num(lm_raw[[idx_se]]),
Pval = if (!is.na(idx_pval)) parse_num(lm_raw[[idx_pval]]) else NA_real_,
Qval = if (!is.na(idx_qval)) parse_num(lm_raw[[idx_qval]]) else NA_real_,
stringsAsFactors = FALSE
)
LM <- LM[is.finite(LM$Coef) & is.finite(LM$Stderr), ]
# ================================================================
# Condition & gender filtering
# ================================================================
if (is.null(lm_condition)) {
lm_condition <- infer_condition_from_name(name)
if (is.na(lm_condition)) {
conds <- sort(unique(LM$Condition))
if (length(conds) == 1) lm_condition <- conds[1]
else stop("Multiple conditions; specify lm_condition.")
}
}
rows_cond <- normalize_cond(LM$Condition) == normalize_cond(lm_condition)
gic_token <- sub(".*_", "", tolower(LM$GenderInCond))
rows_gender <- gic_token == gender_token
LM_sel <- LM[rows_cond & rows_gender, ]
if (nrow(LM_sel) == 0) {
if (strict_gender) stop("No LM rows for gender=", gender_token)
LM_sel <- LM[rows_cond, ]
}
# ================================================================
# Build data for plotting
# ================================================================
p_used <- if (use_qval && any(is.finite(LM_sel$Qval))) LM_sel$Qval else LM_sel$Pval
Res <- data.frame(
name = LM_sel$Steroid,
result = LM_sel$Coef,
se_lm = LM_sel$Stderr,
pval1 = p_used,
stringsAsFactors = FALSE
)
# ================================================================
# Mapping analyte names to groups2
# ================================================================
map_tbl <- do.call(rbind, list(
data.frame(alias=groups2$Abbreviation_old, Group=groups2$Group, AbbrevOld=groups2$Abbreviation_old),
data.frame(alias=groups2$Abbreviation, Group=groups2$Group, AbbrevOld=groups2$Abbreviation_old),
data.frame(alias=groups2$Name, Group=groups2$Group, AbbrevOld=groups2$Abbreviation_old)
))
map_tbl <- map_tbl[!is.na(map_tbl$alias) & nzchar(map_tbl$alias), ]
map_tbl$key_norm <- norm_key(map_tbl$alias)
Res$key_norm <- norm_key(Res$name)
MapDF <- merge(
Res[,c("name","key_norm")],
unique(map_tbl[,c("key_norm","Group","AbbrevOld")]),
by="key_norm", all.x=TRUE
)
Res$Group <- MapDF$Group
Res$AbbrevOld <- MapDF$AbbrevOld
# Drop unmapped
drop_mask <- is.na(Res$Group) | is.na(Res$AbbrevOld)
if (any(drop_mask)) {
warning("Dropped unmapped analytes: ",
paste(sort(unique(Res$name[drop_mask])), collapse=", "))
Res <- Res[!drop_mask, ]
}
# ================================================================
# Error bars
# ================================================================
eb <- se_multiplier * Res$se_lm
Res$errord1 <- Res$result - eb
Res$errord2 <- Res$result + eb
Res$Significance1 <- ifelse(Res$pval1 < alpha, "Yes", "No")
# ================================================================
# Fixed ordering logic
# ================================================================
if (is.null(fixed_group_levels) || is.null(fixed_y_by_group)) {
fixed_group_levels <- unique(as.character(groups2$Group))
if (is.null(global_y_order)) {
global_y_order <- unique(as.character(groups2$Abbreviation_old))
}
fixed_y_by_group <- lapply(fixed_group_levels, function(gl){
grp_set <- groups2$Abbreviation_old[groups2$Group == gl]
global_y_order[global_y_order %in% grp_set]
})
names(fixed_y_by_group) <- fixed_group_levels
}
# label shortening
shorten_label <- function(x) {
x <- as.character(x)
if (shorten_long_group_names)
x <- sub("^Glucocorticoids$", "Glucoc.", x)
x
}
group_label_map <- setNames(shorten_label(fixed_group_levels), fixed_group_levels)
Res$Group <- factor(Res$Group, levels=fixed_group_levels)
groups_frame <- do.call(rbind, lapply(fixed_group_levels, function(gl) {
data.frame(
Group = factor(gl, levels=fixed_group_levels),
GroupDisp = factor(group_label_map[[gl]], levels=group_label_map[fixed_group_levels]),
AbbrevOld = fixed_y_by_group[[gl]],
stringsAsFactors = FALSE
)
}))
ResFull <- merge(groups_frame, Res,
by=c("Group","AbbrevOld"),
all.x=TRUE, sort=FALSE)
# ================================================================
# X limits
# ================================================================
nice_up <- function(z, step=0.05) step * ceiling(z / step)
if (is.null(xlim)) {
if (symmetric_x) {
max_abs <- max(abs(ResFull$errord1), abs(ResFull$errord2), na.rm=TRUE)
xmax <- nice_up(max_abs, step=x_nice_step)
xlim <- c(-xmax, xmax)
} else {
xlim <- range(c(ResFull$errord1, ResFull$errord2), na.rm=TRUE)
}
}
# ================================================================
# Strip color fix — theme() instead of strip_themed
# ================================================================
if (is.null(group_strip_fills)) {
base_cols <- c("#1f77b4","#d62728","#2ca02c","#9467bd","#ff7f0e")
group_strip_fills <- setNames(base_cols[seq_along(fixed_group_levels)],
fixed_group_levels)
}
# we need a strip color per facet row, in the order of GroupDisp
strip_bg_map <- group_strip_fills[group_label_map]
# ================================================================
# Build plot
# ================================================================
sig_cols <- c(No="#999999", Yes="blue")
p <- ggplot2::ggplot(
ResFull,
aes(y = AbbrevOld, x = result, colour = Significance1)
) +
geom_vline(xintercept = 0, linewidth = 0.3, colour = "#cccccc") +
# ✅ ggplot2 4.0-compliant horizontal error bars
geom_errorbar(
aes(xmin = errord1, xmax = errord2),
width = 0,
na.rm = TRUE
) +
geom_point(size = 1.8, na.rm = TRUE) +
scale_colour_manual(values = sig_cols) +
scale_x_continuous(
limits = xlim,
breaks = scales::pretty_breaks(n = x_breaks_n),
minor_breaks = scales::breaks_width(x_minor_width),
labels = scales::label_number(accuracy = x_label_accuracy),
expand = expansion(mult = x_expand)
) +
ggh4x::facet_grid2(
rows = vars(GroupDisp),
scales = "free_y",
space = "free_y",
switch = "y"
) +
ggh4x::facetted_pos_scales(
y = lapply(fixed_group_levels, function(gl) {
scale_y_discrete(limits = fixed_y_by_group[[gl]], drop = FALSE)
})
) +
xlab("Linear Model Estimates (SE)") +
ylab("Steroids") +
theme_classic() +
theme(
axis.title.x = element_text(size=axis_title_size, face="bold",
margin=margin(t=x_title_top_margin)),
axis.title.y = element_text(size=axis_title_size, face="bold"),
axis.text.x = element_text(size=axis_text_size, face=text_face),
axis.text.y = element_text(size=axis_text_size, face=text_face),
legend.position = "none",
# ✅ NEW strip background fix — safe alternative
strip.background.y = element_rect(
fill = strip_bg_map,
colour = NA
),
strip.text.y.left = element_text(
angle = 90,
hjust = strip_hjust,
vjust = strip_vjust,
size = strip_text_size,
face = strip_face,
colour = "white",
margin = margin(l = strip_margin_left, r = strip_margin_right)
)
)
# ================================================================
# Save outputs (NO MAGICK)
# ================================================================
if (save_outputs) {
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
safe_stem <- gsub("[^A-Za-z0-9._-]+","_", name)
width_in <- device_width_px / device_res
height_in <- device_height_px / device_res
# JPG
ggsave(
filename = file.path(output_dir, paste0(safe_stem, ".jpg")),
plot = p,
width = width_in,
height = height_in,
dpi = device_res,
units = "in"
)
# PDF (vector)
ggsave(
filename = file.path(output_dir, paste0(safe_stem, ".pdf")),
plot = p,
width = width_in,
height = height_in
)
# SVG (vector)
ggsave(
filename = file.path(output_dir, paste0(safe_stem, ".svg")),
plot = p,
width = width_in,
height = height_in
)
}
if (return_plot) return(list(plot = p, data = ResFull))
invisible(NULL)
}
run_all_combinations <- function(
groups2,
lm_path,
lm_sheet = "Steroids' adjusted lms",
groups = c("All", "Female", "Male"),
conditions = c("Steatosis", "Fibrosis", "Necroinflammation", "HOMA-IR"),
name_prefix = "Forest plot of",
global_y_order = NULL,
use_qval = FALSE,
alpha = 0.10,
se_multiplier = 0.5,
strict_gender = TRUE,
group_strip_fills = NULL,
shorten_long_group_names = TRUE,
output_dir = getwd(),
save_outputs = TRUE,
return_plots = FALSE,
verbose = TRUE,
...
) {
stopifnot(is.data.frame(groups2))
req_cols <- c("Group","Abbreviation","Abbreviation_old","Name")
stopifnot(all(req_cols %in% colnames(groups2)))
if (!file.exists(lm_path)) stop("LM results file not found: ", lm_path)
ord <- build_fixed_orders(groups2, global_y_order = global_y_order)
fixed_group_levels <- ord$group_levels
fixed_y_by_group <- ord$group_to_abbr
global_y_order <- ord$global_y_order
logs <- list(); plots <- list(); k <- 0
for (g in groups) {
for (cond in conditions) {
k <- k + 1
plot_name <- paste(name_prefix, g, "Steroid Coefs in", cond)
if (isTRUE(verbose)) message(sprintf("[INFO] %s | %s", g, cond))
out <- tryCatch({
CalculateErrors(
Group = g,
name = plot_name,
groups2 = groups2,
lm_path = lm_path,
lm_sheet = lm_sheet,
lm_condition = cond,
use_qval = use_qval,
alpha = alpha,
se_multiplier = se_multiplier,
strict_gender = strict_gender,
fixed_group_levels = fixed_group_levels,
fixed_y_by_group = fixed_y_by_group,
global_y_order = global_y_order,
group_strip_fills = group_strip_fills,
shorten_long_group_names = shorten_long_group_names,
output_dir = output_dir,
save_outputs = save_outputs,
return_plot = return_plots,
...
)
}, error = function(e) e)
if (inherits(out, "error")) {
msg <- conditionMessage(out)
if (isTRUE(verbose)) message("[WARN] Skipped ", g, " | ", cond, " — ", msg)
logs[[k]] <- data.frame(Group=g, Condition=cond, Status="error", Message=msg, stringsAsFactors=FALSE)
} else {
if (isTRUE(verbose) && isTRUE(save_outputs)) {
message(sprintf("[OK] Saved to: %s", normalizePath(output_dir)))
}
logs[[k]] <- data.frame(Group=g, Condition=cond, Status="ok", Message="", stringsAsFactors=FALSE)
if (isTRUE(return_plots)) plots[[paste(g, cond, sep = " | ")]] <- out$gtable
}
}
}
log_df <- do.call(rbind, logs)
if (isTRUE(return_plots)) return(list(log = log_df, plots = plots))
log_df
}
outdir <- file.path(getwd(), "forest_plots_fix")
dir.create(outdir, showWarnings = FALSE)
global_order <- c("11b-OHA4","11-KDHT","11-KT","11-KA4","A4","AN","DHEA","DHT","T/Epi-T",
"E2","E1","S","E","A","B","DOC","17a-OHP5","17a-OHP4","P5","P4")
my_strip_fills <- c(
Androgens = "#1f77b4",
Estrogens = "#d62728",
Glucocorticoids = "#2ca02c",
Mineralocorticoids = "#9467bd",
Progestogens = "#ff7f0e"
)
# groups2red=groups2
# groups2red[groups2red[,1]=='Estrogens',1]='Estrog.'
# In case you need run just one case and gender/s:
# CalculateErrors(
# Group = "All",
# name = "TEST_All_Steatosis4a",
# groups2 = groups2,
# lm_path = "lms_tikka19324_v2.xlsx",
# lm_sheet = "Steroids' adjusted lms",
# lm_condition = "Steatosis",
# global_y_order = global_order,
# group_strip_fills = my_strip_fills,
# # label placement toward panel & margins
# strip_vjust = 0.28, strip_margin_left = 8, strip_margin_right = 8,
# # fonts
# axis_title_size = 12, axis_text_size = 12, strip_text_size = 12,
# # x-axis
# symmetric_x = TRUE, x_breaks_n = 7, x_nice_step = 0.05,
# x_minor_width = 0.025, x_expand = c(0.02,0.02), x_label_accuracy = 0.01,
# # device
# device_width_px = 5000, device_height_px = 8800, device_res = 800,
# output_dir = outdir, save_outputs = TRUE, return_plot = FALSE
# )
#2) Batch
log_df <- run_all_combinations(
groups2 = groups2,
lm_path = "lms_tikka19324_v2.xlsx",
lm_sheet = "Steroids' adjusted lms",
groups = c("All","Female","Male"),
conditions = c("Steatosis","Fibrosis","Necroinflammation","HOMA-IR"),
name_prefix = "Forest plot of",
global_y_order = global_order,
group_strip_fills = my_strip_fills,
# same visual knobs as above (they’ll pass through):
strip_vjust = 0.28, strip_margin_left = 8, strip_margin_right = 8,
# fonts
axis_title_size = 14, axis_text_size = 12, strip_text_size = 12,
# x-axis
symmetric_x = TRUE, x_breaks_n = 7, x_nice_step = 0.05,
x_minor_width = 0.025, x_expand = c(0.02,0.02), x_label_accuracy = 0.01,
# device
device_width_px = 5000, device_height_px = 8800, device_res = 800,
output_dir = outdir, save_outputs = TRUE, return_plot = FALSE
)
log_df
# Fyi: I was able to revise some of the above codes with Copilot...
Forest plot of All Steroid Ratios in Steatosis
Forest plot of Female Steroid Ratios in Steatosis
Forest plot of Male Steroid Ratios in Steatosis