Simulation and analysis with R, GCTA, and LDSC
My work often compares results from GCTA-GREML and LDSC. This note summarizes the end-to-end workflow—from data preparation to calling GCTA/LDSC from R—in a reusable, extensible form.
0. Preparation
Prepare the SNP list, sample list, and LD scores for simulation.
0.1 Prepare data for bigsnpr/bigstatsr
library(bigsnpr)
library(bigstatsr)
PrepareX = FALSE
if(PrepareX){
rds = bigsnpr::snp_readBed2(bedfile = "../../Cric_HF_RealData/rawdata/cric.filtered.maf0.01.bed",
backingfile = "cric_geno",ind.col= ind.column)
geno <- snp_attach(rds)
G <- geno$genotypes
CHR <- geno$map$chromosome
infos <- snp_fastImputeSimple(G, method = "mean2")
geno$genotypes <- infos
geno <- snp_save(geno)
G <- geno$genotypes
ind.keep <- snp_clumping(G, infos.chr = geno$map$chromosome,
infos.pos = geno$map$physical.pos,
thr.r2 = 0.01)
saveRDS(ind.keep, file ="ind_keep_clump.rds")
data.table::fwrite( as.data.frame(geno$map$marker.ID), file = "snplist.txt")
}
0.2 Prepare for GCTA (GRM)
ref.bed = "../../Cric_HF_RealData/rawdata/cric.filtered.maf0.01"
MakeGRM = function(ref.bed, keep.snp, keep.ind, output.dir = "./temp/"){
grm = paste0(output.dir, "geno")
fwrite(data.frame(keep.snp), file = paste0(output.dir,"snplist.txt"), quote = F, sep = "\t", col.names=T, na = "NA")
system(paste0("plink --bfile ", ref.bed, " --extract ", paste0(output.dir, "snplist.txt"), " --make-grm-bin --out ", grm ) )
}
0.3 Prepare LD score for LDSC
ldsc.py --bfile ../../Cric_HF_RealData/rawdata/cric.filtered.maf0.01 --extract ./temp/snplist.txt --ld-wind-kb 1000 --out ./temp/geno
1. GCTA
GCTA is widely used for GWAS analysis, but it can be awkward to integrate into an R-based simulation pipeline. Below is a minimal workflow for running bivariate REML from R.
Inputs
- phenotype file (Y1, Y2)
- genotype file (geno.bed)
First, generate the GRM from the genotype file. This step only needs to be done once.
MakeGRM = function(ref.bed, keep.snp, keep.ind, output.dir = "./temp/"){
grm = paste0(output.dir, "geno")
fwrite(data.frame(keep.snp), file = paste0(output.dir,"snplist.txt"), quote = F, sep = "\t", col.names=T, na = "NA")
system(paste0("plink --bfile ", ref.bed, " --extract ", paste0(output.dir, "snplist.txt"), " --make-grm-bin --out ", grm ) )
}
Next, write the phenotype file using FID and IID.
WritePheno = function( Y1, Y2, FID, IID, pheno.file ){
pheno = data.frame(FID = FID, IID = IID,
Y1 = Y1, Y2 = Y2)
fwrite(pheno, file = pheno.file,
quote = F, sep = "\t", col.names=T, na = "NA")
}
Then run GCTA.
SimuGCTA = function(geno.file, pheno.file, output.dir){
output.file = paste0(output.dir, "GCTA.result")
system(paste0("./gcta64 --reml-bivar --grm ",
geno.file," --pheno ", pheno.file,
" --thread-num 20 --out ", output.file ))
}
If simulated errors have no covariance, use --reml-bivar-nocove for a fair comparison. To test non-zero genetic covariance, use --reml-bivar-lrt-rg 0.
SimuGCTA.test = function(geno.file, pheno.file, output.dir){
output.file = paste0(output.dir, "GCTA.result")
system(paste0("./gcta64 --reml-bivar-nocove --reml-maxit 100 --grm ",
geno.file," --pheno ",
pheno.file," --reml-bivar-lrt-rg 0 --out ",output.file))
}
Read the GCTA output file.
read.GCTA = function(output.dir, LRT = T){
filename = paste0(output.dir, "GCTA.result.hsq")
if(file.exists(filename)){
hsq = data.table::fread(filename, fill = T)
Est = as.vector(t( hsq[1:3, 2:3])) %>%
setNames(c("Heri.1","Heri.1.Se","Heri.2","Heri.2.Se", "GeCv","GeCv.Se" ))
if(LRT){
pval = as.numeric(sapply(strsplit(as.character(hsq[hsq$Source == "Pval",2]), "\\("), "[[", 1))
}
}
return(list(Est = Est, pval = pval))
}
2. LDSC
First, compute LD scores from the genotype file (done once).
module load ldsc
source activate ldsc-1.0.1
ldsc.py --bfile geno --ld-wind-cm 1 --out chr1_select
ldsc.py --bfile ../../Cric_HF_RealData/rawdata/cric.filtered.maf0.01 --extract ./temp/snplist.txt --ld-wind-kb 1000 --out ./temp/geno
Implement LDSC regression.
SimuLDSC = function(Z1, N1, Z2, N2, Nc=0, weight=T, CSTR = T,
info = NA, LDSCORE, corenum = 1, output.dir){
if(length(Z1) != length(Z2) ){stop("Z scores should have the same length.")}
if(!is.na(info)){
print("update info")
Z1.data = data.frame(SNP = info$SNP , Z = Z1, N = N1,
A1 = "A", A2 = "G", stringsAsFactors = F)
colnames(Z1.data) = c("SNP", "Z", "N", "A1", "A2")
fwrite(Z1.data, file = paste0(output.dir, "Z1.sumstats") , quote = F, sep = "\t", col.names=T, na = "NA" )
Z2.data = data.frame(SNP = info$SNP , Z = Z2, N = N2,
A1 = "A", A2 = "G", stringsAsFactors = F)
colnames(Z2.data) = c("SNP", "Z", "N", "A1", "A2")
fwrite(Z2.data, file = paste0(output.dir, "Z2.sumstats"), quote = F, sep = "\t", col.names=T, na = "NA" )
}
if(weight == T){
name = paste0(output.dir, "LDSC")
system(paste0("ldsc.py --rg ",paste0(J, "Z1.sumstats,"), paste0(J, "Z2.sumstats"),
" --ref-ld ", LDSCORE," --w-ld ",LDSCORE," --intercept-h2 1,1 --out ",name))
}else{
stop()
}
}
Read the LDSC output.
read.LDSC = function(output.dir){
H = fread(paste0(output.dir, "LDSC.log"), fill = T)
result.all = rep(NA, 8)
result.gecr = unlist(H[(stringr::str_detect(unlist(H), "Genetic Correlation:"))]) %>% str_split(, pattern = " ") %>% unlist()
result.heri = unlist(H[(stringr::str_detect(unlist(H), "Total Observed scale h2:"))]) %>% str_split(, pattern = " ") %>% unlist()
result.gecov = unlist(H[(stringr::str_detect(unlist(H), "Total Observed scale gencov:"))]) %>% str_split(, pattern = " ") %>% unlist()
result.P = unlist(H[(stringr::str_detect(unlist(H), "P:"))]) %>%
str_split(, pattern = " ") %>% unlist()
if(!is.null(result.gecr)){
result = as.numeric(unlist(regmatches(result.gecr,gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",result.gecr, perl=TRUE))))
result.all[1] = result[1]
result.all[2] = result[2]
}
if(!is.null(result.heri)){
result = as.numeric(unlist(regmatches(result.heri,
gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",
result.heri, perl=TRUE))))
result.all[3] = result[2]
result.all[4] = result[3]
result.all[5] = result[5]
result.all[6] = result[6]
}
if(!is.null(result.gecov)){
result = as.numeric(unlist(regmatches(result.gecov,
gregexpr("-?\\ *[0-9]+\\.?[0-9]*(?:[Ee]\\ *-?\\ *[0-9]+)?",
result.gecov, perl=TRUE))))
result.all[7] = result[1]
result.all[8] = result[2]
zscore = result[1]/result[2]
pvalue = 2* (1 - pnorm(abs(zscore)))
}else{
pvalue = NA
}
return( c(GeCv.pv = pvalue, GeCr.pv = as.numeric(result.P[2])) )
}