library(Biobase) library(MPR) ## load SNP alleles at putative SNP sites data(snpData) ## Or: ## snpData <- as.matrix(read.table("MPR_data/snpData_chr05.txt")) ## load overlapping SNP alleles from low-depth sequences of one parent ## you can also use genotype results from traditional genotyping studies, ## e.g. results of SSR/RFLP markers data(markerData) ## Or: ## markerData <- as.matrix(read.table("MPR_data/markerData_chr05.txt",row.names=1,header=T))[,1] ## load negative SNP dataset data(fSNP) ## Or: ## fSNP <- scan("MPR_data/fSNP_chr05.txt",'') ## load postitive SNP dataset data(checkData) ## Or: ## checkData <- as.matrix(read.table("MPR_data/checkData_chr05.txt")) ################################################################################# ## select 50 SNP sites to test inference of parental genotypes set.seed(123);myBaseData <- snpData[sample(200,50),] ## random assignments of parental genotypes to alleles will result in ## a big number of recombinations allele.random <- base2Allele(myBaseData) ## a big number of recombinations NumRecomEvents(myBaseData,allele.random) ## MPR inference with maximum step size of 5 allele.MPR <- localMPR(baseData=myBaseData,maxIterate=50,maxNStep=5,showDetail=TRUE) ## should be a small number compared with random assignments above NumRecomEvents(myBaseData,allele.MPR) set.seed(123) ## select 30 markers randomly markers <- sample(names(markerData)[10:50],20) ## select SNP sites which contain the 30 markers ids <- match(markers,rownames(snpData)) str(myBaseData <- snpData[min(ids):max(ids),]) ## construct positive and negative dataset fsnp <- fSNP tsnp <- rownames(checkData) str(fsnp <- intersect(fsnp,rownames(myBaseData))) str(tsnp <- intersect(tsnp[!tsnp%in%fsnp],rownames(myBaseData))) ## global MPR aiding with marker data allele.MPR <- globalMPRByMarkers(myBaseData,markers=markerData,numTry=3,numBaseStep=50, numBaseCandidateStep=100,numMarkerStep=10,useMedianToFindKnown=TRUE, maxIterate=150,maxNStep=3,scoreMin=0.8,verbose=TRUE) ## genotypes at some SNP site may be missing due to concordence less than 80% (scoreMin) table(is.na(alleleA <- allele.MPR[,1])) ## check with positive dataset. looks good but contains false SNPs and may be with some errors ids <- match(rownames(allele.MPR),rownames(checkData)) table(checkData[ids,1]==alleleA) ## positive data table(tmp <- tsnp%in%names(na.omit(alleleA)));mean(tmp) ## negative data table(tmp <- fsnp%in%names(na.omit(alleleA)));mean(tmp) ## then you need to refine the MPR results set.seed(123);system.time(all.res <- globalMPRRefine(myBaseData,alleleA=na.omit( allele.MPR[,1]),numGroup=238,groupSort=TRUE,numPerm=20,numTry=3, numBaseStep=50,numBaseCandidateStep=100,numKnownStep=30, numKnownCandidateStep=50,useMedianToFindKnown=TRUE,maxIterate=150, maxNStep=3,scoreMin=0.8,saveMidData=TRUE,verbose=TRUE)) ## summarize results using Bayesian inference perm <- 10 res <- all.res$midData[[perm]] table(geno.res <- genotypeCallsBayes(res$call,errorRate=1e-11,eps=1e-10,maxIterate=100, verbose=FALSE)$type) ## reconstruct parental genotypes allele.MPR <- res$allele allele.MPR[geno.res==2|rowMin(res$call)>=perm,] <- NA allele.MPR[geno.res==3,] <- allele.MPR[geno.res==3,c(2,1)] ## predicted alleles of parent 1 table(is.na(alleleA <- allele.MPR[,1])) ## check again using positive and negative dataset ids <- match(names(alleleA),rownames(checkData)) table(checkData[ids,1]==alleleA) ## the power of the MPR algorithm. lost 1 high quality SNP but removed >90% of inferior SNPs ## positive data table(tmp <- tsnp%in%names(na.omit(alleleA)));mean(tmp) ## FALSE TRUE ## 1 1227 ## [1] 0.9991857 ## negative data table(tmp <- fsnp%in%names(na.omit(alleleA)));mean(tmp) ## FALSE TRUE ## 36 3 ## [1] 0.07692308 ################################################################################ ## correct genotypes using HMM ## Example O <- c(1,1,1,1,1,1,1,2,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,1,2,2,2,2, 2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,1,2,2,1,2,1,2,1,2,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,1, 1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1) O.pos <- 30e3*1:length(O) O.cr <- hmm.vitFUN.rils(geno=O,position=O.pos,geno.probability=c(0.4975, 0.4975,0.005),transitionFUN = phy2get.haldane.rils, emissionFUN = makeEmissionFUN(errorRate = 0.01)) rbind(raw=paste(O,collapse=''),correct=paste(O.cr,collapse='')) ## [,1] ## raw "1111111211111121111111222222221222222221222221222222222222221211221212121212111111121111111111121111111211111" ## correct "1111111111111111111111222222222222222222222222222222222222223333333333333333111111111111111111111111111111111" ## real data snpSet <- sort(rownames(na.omit(allele.MPR))) ids <- match(snpSet,rownames(myBaseData)) table(is.na(geno.data <- base2Geno(myBaseData[ids,],allele.MPR[ids,]))) geno.data.cr <- geno.data SNPbyChr <- split(1:nrow(geno.data),substr(rownames(geno.data),1,2)) for(chr in names(SNPbyChr)){ cat("\r",chr) ids <- SNPbyChr[[chr]] geno.data.chr <- correctGeno(geno.data[ids,],correct.FUN=correctFUNHMM,hmmFUN=hmm.vitFUN.rils, geno.probability=c(0.4975, 0.4975,0.005),transitionFUN=phy2get.haldane.rils, emissionFUN=makeEmissionFUN(errorRate=0.0106)) geno.data.cr[ids,] <- geno.data.chr } ## lump genotypes to bin str(geno.data.subset <- geno.data.cr[rowSums(!is.na(geno.data.cr))>0,]) snpSite <- as.numeric(substr(rownames(geno.data.subset),1,10)) geno.data.bin <- genoToBin(geno.data.subset,base.position=snpSite,corrected=TRUE,size=250e3,num=5,fillSmallNA=TRUE,minBinsize=5e3,heterozygote=FALSE) ## genptype bins with missing data str(geno.bin <- geno.data.bin[[2]]) ################################################################################# library(qtl) phenoData<-as.matrix(read.table("MPR_data/phenoData.txt",header=T,row.name=1)) ## Attention!!!!! the name of SNP MUST BE started with ten numeric characters, the first two characters for chromosome and the other eight charactes for physical position of the SNP, otherwise you have to modify the code of geno2Cross (the row assigning value to myGeno.site) summary(myCrossData <- geno2Cross(geno.bin,phenoData)) cross.data <- myCrossData myMap <- est.map(cross.data, error.prob=0.00, map.function=c("haldane","kosambi","c-f","morgan")[1], m=0, p=0, maxit=1000, tol=1e-6, sex.sp=FALSE, verbose=FALSE, omit.noninformative=TRUE) summary(myMap) plot.map(myMap) cross.data <- replace.map(cross.data, myMap) cross.data <- calc.genoprob(cross.data,error.prob=0.00) cross.fill <- fill.geno(cross.data,method=c("imp","argmax")[2], error.prob=0.00,map.function="haldane"); cross.fill <- calc.genoprob(cross.fill,error.prob=0.00) id.pheno <- c("Grain_width") ## res <- cim(cross.data,pheno.col=id.pheno,n.marcovar=3,window=10,method="em",imp.method=c("imp", "argmax")[2], error.prob=0.01) res <- scanone(cross.fill,pheno.col=id.pheno,method="em") summary(res,threshold=4, lodcolumn=1) i <- which.max(res$lod) res[(i-3):(i+3),] plot.scanone(res,chr=5) ## vtype="S",chr=5,xlim=c(25,30) str(geno.fill <- t(pull.geno(cross.fill))-1) colnames(geno.fill) <- colnames(geno.bin) ## Final Bin Map str(geno.uniq <- mergeBinMap(geno.fill)) ################################################################################# X11(width=10,height=8) par(mfrow=c(2,1),mar=c(4.1,4.1,2.1,1.1)) sel.cols <- 1:10 ## comparisons ## raw geno.colors <- geno.data[,sel.cols]+1;geno.colors[is.na(geno.colors)] <- 0 plot(matrix(1:nrow(geno.data),nrow=nrow(geno.data),ncol=length(sel.cols)), matrix(rep(sel.cols,each=nrow(geno.data)),nrow=nrow(geno.data),ncol=length(sel.cols)),col=geno.colors,pch="|", xlab='',ylab="RIL index",mar=c(1.1,2.1,2.1,1.1)) ## corrected geno.colors <- geno.data.cr[,sel.cols]+1;geno.colors[is.na(geno.colors)] <- 0 plot(matrix(1:nrow(geno.data),nrow=nrow(geno.data),ncol=length(sel.cols)), matrix(rep(sel.cols,each=nrow(geno.data)),nrow=nrow(geno.data),ncol=length(sel.cols)),col=geno.colors,pch="|", xlab="SNP index",ylab="RIL index",mar=c(4.1,2.1,2.1,1.1)) dev.off() sessionInfo() ## R version 2.5.0 (2007-04-23) ## i686-pc-linux-gnu ## locale: ## LC_CTYPE=en_US.UTF-8;LC_NUMERIC=C;LC_TIME=en_US.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=en_US.UTF-8;LC_MESSAGES ## =en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C ## attached base packages: ## [1] "splines" "tools" "stats" "graphics" "grDevices" "utils" ## [7] "datasets" "methods" "base" ## other attached packages: ## MPR genefilter survival Biobase ## "0.1" "1.14.1" "2.34" "1.14.1"