#' Convert a VCF file into a pooldata object.
#' @description Convert VCF files into a pooldata object.
#' @param vcf.file The name (or a path) of the VCF file (might be in compressed format)
#' @param poolsizes A numeric vector with haploid pool sizes
#' @param poolnames A character vector with the names of pool
#' @param min.rc Minimal allowed read count per base (options silenced for VarScan vcf). Bases covered by less than min.rc reads are discarded and considered as sequencing error. For instance, if nucleotides A, C, G and T are covered by respectively 100, 15, 0 and 1 over all the pools, setting min.rc to 0 will lead to discard the position (the polymorphism being considered as tri-allelic), while setting min.rc to 1 (or 2, 3..14) will make the position be considered as a SNP with two alleles A and C (the only read for allele T being disregarded). For VarScan vcf, markers with more than one alternative allele are discarded because the VarScan AD field only contains one alternate read count.
#' @param min.cov.per.pool Minimal allowed read count (per pool). If at least one pool is not covered by at least min.cov.per.pool reads, the position is discarded
#' @param max.cov.per.pool Maximal allowed read count (per pool). If at least one pool is covered by more than max.cov.per.pool reads, the position is discarded
#' @param min.maf Minimal allowed Minor Allele Frequency (computed from the ratio of the overall read counts for the reference allele over the overall read coverage). Positions with a MAF lower than or equal to min.maf (or with no read at all) are discarded
#' @param nlines.per.readblock Number of Lines read simultaneously. Should be adapted to the available RAM.
#' @param nthreads Number of available threads for parallelization of some part of the parsing (default=1, i.e., no parallelization)
#' @return A pooldata object containing 7 elements:
#' \enumerate{
#' \item "refallele.readcount": a matrix with nsnp rows and npools columns containing read counts for the reference allele (chosen arbitrarily) in each pool
#' \item "readcoverage": a matrix with nsnp rows and npools columns containing read coverage in each pool
#' \item "snp.info": a matrix with nsnp rows and four columns containing respectively the contig (or chromosome) name (1st column) and position (2nd column) of the SNP; the allele taken as reference in the refallele.readcount matrix (3rd column), i.e., the allele of the reference assembly except for multi-allelic positions reduced to two alleles with the min.rc criterion (first retained allele); and the alternative allele (4th column)
#' \item "poolsizes": a vector of length npools containing the haploid pool sizes
#' \item "poolnames": a vector of length npools containing the names of the pools
#' \item "nsnp": a scalar corresponding to the number of SNPs
#' \item "npools": a scalar corresponding to the number of pools
#' }
#' @details Genotype format in the vcf file for each pool is assumed to contain either i) an AD field containing allele counts separated by a comma (as produced by popular software such as GATK or samtools/bcftools) or ii) both a RD (reference allele count) and a AD (alternate allele count) as obtained with the VarScan mpileup2snp program (when run with the --output-vcf option). The underlying format is automatically detected by the function. For VarScan generated vcf, it should be noticed that SNPs with more than one alternate allele are discarded (because only a single count is then reported in the AD fields) making the min.rc unavailable. The VarScan --min-reads2 option might replace to some extent this functionalities although SNP where the two major alleles in the Pool-Seq data are different from the reference allele (e.g., expected to be more frequent when using a distantly related reference genome for mapping) will be disregarded.
#' @examples
#'  make.example.files(writing.dir=tempdir())
#'  pooldata=vcf2pooldata(vcf.file=paste0(tempdir(),"/ex.vcf.gz"),poolsizes=rep(50,15))
#' @export
vcf2pooldata<-function(vcf.file="",poolsizes=NA,poolnames=NA,min.cov.per.pool=-1,min.rc=1,max.cov.per.pool=1e6,min.maf=0.01,nlines.per.readblock=1000000,nthreads=1){
  ## Parallel backend set-up (only when more than one thread is requested)
  if(nthreads>1){
    tmp.ncores <- detectCores()
    # detectCores() may return NA on some platforms: only clamp when it is known
    if(!is.na(tmp.ncores) && nthreads>tmp.ncores){nthreads <- tmp.ncores}
    options(cores=nthreads)
    registerDoParallel()
  }
  use.parallel <- nthreads>1
  if(nchar(vcf.file)==0){stop("ERROR: Please provide the name of the vcf file as generated by e.g. VarScan")}
  if(anyNA(poolsizes)){stop("ERROR: Please provide a vector of Pool Sizes (poolsize argument)")}
  i<-NULL #workaround (see https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when) to avoid note with check: "no visible binding for global variable"
  ###############################internal function to parse data
  # Parse a block of genotype strings into read counts.
  #   count_data : character matrix (positions x pools) of per-pool genotype fields
  #   snpdet_data: character matrix (positions x 4): chrom, pos, ref allele, alt allele(s)
  #   n.index    : number of ":"-separated sub-fields declared in the FORMAT column
  #   ad.index / rd.index: position of the AD / RD sub-fields within FORMAT
  #   gatk       : TRUE if AD holds comma separated counts for all alleles, FALSE for VarScan (RD + AD)
  # Returns a list with YY (reference allele counts), NN (coverage) and the
  # (possibly updated) snpdet matrix. Uncalled positions keep zero counts.
  parse_data_internal_ <- function(count_data,snpdet_data,npools,n.index,ad.index,rd.index,gatk,min.rc){
    npos <- nrow(count_data)
    YY <- NN <- matrix(0,npos,npools)
    if(npos==0){return(list(YY=YY,NN=NN,snpdet=snpdet_data))}
    posindex <- seq_len(npos)
    # a genotype is considered as called if it contains all the sub-fields declared in FORMAT
    is_called_ <- function(x){lengths(regmatches(x, gregexpr(":", x))) == (n.index - 1)}
    # split genotype strings into a (n x n.index) character matrix
    split_fields_ <- function(x){matrix(unlist(strsplit(x,split = ":")),ncol=n.index,byrow=TRUE)}
    #count the number of alternate allele (1 in the case of VarScan vcf: cf supra)
    nalt_all <- lengths(regmatches(snpdet_data[,4], gregexpr(",", snpdet_data[,4]))) + 1
    #start by treating strictly bi-allelic markers (nalt_all==1), expected to be the vast majority (more efficient algo)
    posindex.cur <- posindex[nalt_all==1]
    if(length(posindex.cur)>0){
      for(i in seq_len(npools)){
        snpcalled <- posindex.cur[is_called_(count_data[posindex.cur,i])]
        if(length(snpcalled)==0){next} # nothing called for this pool in the block
        cnt.dat <- split_fields_(count_data[snpcalled,i])
        if(gatk){
          ad.list <- strsplit(cnt.dat[,ad.index],split = ",")
          # AD fields without exactly two counts (e.g. ".") are treated as uncalled
          ad.ok <- lengths(ad.list)==2
          if(!any(ad.ok)){next}
          snpcalled <- snpcalled[ad.ok]
          cnt.dat <- matrix(unlist(ad.list[ad.ok]),ncol=2,byrow=TRUE)
        }else{
          cnt.dat <- cnt.dat[,c(rd.index,ad.index),drop=FALSE]
        }
        YY[snpcalled,i] <- as.numeric(cnt.dat[,1])
        NN[snpcalled,i] <- YY[snpcalled,i] + as.numeric(cnt.dat[,2])
      }
    }
    #then consider snps with more than 1 alternate allele (necessarily in GATK format): less efficient algo
    if(gatk){
      for(cc in unique(nalt_all[nalt_all>1])){
        posindex.cur <- posindex[nalt_all==cc]
        ncur_pos <- length(posindex.cur)
        array.cnt <- array(0,dim=c(ncur_pos,cc+1,npools))
        overall.cnt <- matrix(0,ncur_pos,cc+1)
        for(i in seq_len(npools)){
          geno <- count_data[posindex.cur,i]
          called.loc <- which(is_called_(geno)) # local index (within posindex.cur)
          if(length(called.loc)==0){next}
          ad.list <- strsplit(split_fields_(geno[called.loc])[,ad.index],split = ",")
          ad.ok <- lengths(ad.list)==(cc+1)
          if(!any(ad.ok)){next}
          called.loc <- called.loc[ad.ok]
          # only fill the rows of called positions (other rows remain at zero)
          array.cnt[called.loc,,i] <- matrix(as.numeric(unlist(ad.list[ad.ok])),ncol=cc+1,byrow=TRUE)
          overall.cnt <- overall.cnt + matrix(array.cnt[,,i],ncur_pos,cc+1)
        }
        #Convert to bi-allelic SNPs according to min.rc criterion
        overall.cnt[overall.cnt<=min.rc] <- 0
        truesnp.index <- which(rowSums(overall.cnt>0)==2) #Warning: local index (valid for nomalleles/array.cnt but not for YY/NN)
        if(length(truesnp.index)>0){
          #allele names (to obtain the names of the two retained alleles)
          nomalleles <- cbind(snpdet_data[posindex.cur,3],matrix(unlist(strsplit(snpdet_data[posindex.cur,4],split = ",")),ncol=cc,byrow=TRUE))
          for(k in truesnp.index){#simpler to loop over SNPs (their number should be small)
            cnt.index <- which(overall.cnt[k,]>0)
            glob.index <- posindex.cur[k]
            # the two retained alleles become the ref (col. 3) and alt (col. 4) alleles
            snpdet_data[glob.index,3:4] <- nomalleles[k,cnt.index]
            YY[glob.index,] <- array.cnt[k,cnt.index[1],]
            NN[glob.index,] <- YY[glob.index,] + array.cnt[k,cnt.index[2],]
          }
        }
      }
    }
    return(list(YY=YY,NN=NN,snpdet=snpdet_data))
  }
  ###############################
  poolsizes <- as.numeric(poolsizes)
  file.con <- file(vcf.file,open="r")
  on.exit(close(file.con),add=TRUE) # the connection is closed even if an error is raised
  ## skip the meta-information lines up to the #CHROM header line
  repeat{
    tmp.data <- scan(file=file.con,nlines = 1,what="character",quiet=TRUE)
    if(length(tmp.data)==0){stop("ERROR: No header line starting with #CHROM was found in the vcf file")}
    if(tmp.data[1]=="#CHROM"){break}
  }
  npools <- length(tmp.data)-9
  if(npools<1){stop("ERROR: No pool (sample) column was found in the header of the vcf file")}
  if(length(poolsizes)!=npools){stop("ERROR: The number of pools in the vcf file is different from the length of the vector of pool sizes")}
  if(anyNA(poolnames)){
    poolnames <- paste0("Pool",seq_len(npools))
  }else{
    poolnames <- as.character(poolnames)
    if(length(poolnames)!=npools){stop("ERROR: The number of pools in the vcf file is different from the length of vector of pool names")}
  }
  ### read the file by blocks of nlines.per.readblock lines
  blocks.Y <- list() ; blocks.N <- list() ; blocks.snpdet <- list() # retained data of each block (bound once at the end)
  nsnp.kept <- 0
  nlines.read <- 0
  format.detected <- FALSE
  continue.reading <- TRUE
  time1 <- proc.time()
  while(continue.reading){
    tmp.data <- matrix(scan(file=file.con,nlines = nlines.per.readblock,what="character",quiet=TRUE),ncol=npools+9,byrow=TRUE)
    tmp.nlines.read <- nrow(tmp.data)
    if(tmp.nlines.read<nlines.per.readblock){continue.reading <- FALSE}
    if(tmp.nlines.read==0){break} # end of file reached
    ## retrieve the position of the AD and RD sub-fields from the first data line
    if(!format.detected){
      tmp.format <- unlist(strsplit(tmp.data[1,9],split=":"))
      n.index <- length(tmp.format)
      ad.index <- which(tmp.format=="AD") ; rd.index <- which(tmp.format=="RD")
      if(length(ad.index)==0){
        stop("ERROR: No field containing allele depth (AD field) was detected in the vcf file")
      }
      if(length(rd.index)==0){
        GATK <- TRUE
        cat("Standard format (i.e., as in Bcftools, GATK, etc.) detected for the AD field: the read count for all the alleles identified are separated by a comma\n")
      }else{
        GATK <- FALSE
        cat("VarScan like format detected for allele count data: the AD field contains allele depth for the alternate allele and RD field for the reference allele (N.B., positions with more than one alternate allele will be ignored)\n")
      }
      format.detected <- TRUE
    }
    nlines.read <- nlines.read+tmp.nlines.read
    # discard positions without alternate allele
    tmp.data <- tmp.data[tmp.data[,5]!=".",,drop=FALSE]
    if(!GATK){
      #For Varscan vcf's marker with more than 2 alleles (i.e., alt allele field contains a comma) need to be eliminated at this stage because counts for every bases are no more available (only one of the alternate base is considered in the AD field!)
      tmp.data <- tmp.data[!grepl(",",tmp.data[,5]),,drop=FALSE]
    }
    npos <- nrow(tmp.data)
    if(npos>0){
      tmp.snpdet <- tmp.data[,c(1,2,4,5),drop=FALSE]
      tmp.cnt <- tmp.data[,-(1:9),drop=FALSE]
      rm(tmp.data)
      if(use.parallel){
        ## parallelization over (non empty) blocks of contiguous positions
        n.chunks <- min(nthreads,npos)
        chunks <- unname(split(seq_len(npos),ceiling(seq_len(npos)*n.chunks/npos)))
        res.list <- foreach(i=seq_along(chunks)) %dopar% {
          parse_data_internal_(count_data=tmp.cnt[chunks[[i]],,drop=FALSE],snpdet_data=tmp.snpdet[chunks[[i]],,drop=FALSE],npools,n.index,ad.index,rd.index,GATK,min.rc)
        }
        tmp.Y <- do.call(rbind,lapply(res.list,function(x){x$YY}))
        tmp.N <- do.call(rbind,lapply(res.list,function(x){x$NN}))
        tmp.snpdet <- do.call(rbind,lapply(res.list,function(x){x$snpdet}))
        rm(res.list)
      }else{
        internal.res <- parse_data_internal_(count_data=tmp.cnt,snpdet_data=tmp.snpdet,npools,n.index,ad.index,rd.index,GATK,min.rc)
        tmp.Y <- internal.res$YY
        tmp.N <- internal.res$NN
        tmp.snpdet <- internal.res$snpdet
        rm(internal.res)
      }
      ## filters on coverage and maf
      tmp.maf <- 0.5-abs(0.5-rowSums(tmp.Y)/rowSums(tmp.N))
      dum.sel <- (rowSums(tmp.N>=min.cov.per.pool)==npools) & (rowSums(tmp.N<=max.cov.per.pool)==npools) & (tmp.maf>min.maf)
      # positions without any read (e.g., multi-allelic positions that could not be
      # reduced to two alleles) have an undefined maf and must be discarded
      dum.sel[is.na(dum.sel)] <- FALSE
      if(any(dum.sel)){
        n.blocks <- length(blocks.Y)+1
        blocks.Y[[n.blocks]] <- tmp.Y[dum.sel,,drop=FALSE]
        blocks.N[[n.blocks]] <- tmp.N[dum.sel,,drop=FALSE]
        blocks.snpdet[[n.blocks]] <- tmp.snpdet[dum.sel,,drop=FALSE]
        nsnp.kept <- nsnp.kept+sum(dum.sel)
      }
    }
    time.elapsed <- (proc.time()-time1)[3]
    nhours <- floor(time.elapsed/3600)
    nminutes <- floor((time.elapsed-nhours*3600)/60)
    nseconds <- round(time.elapsed-nhours*3600-nminutes*60)
    cat(nlines.read," lines processed in",nhours,"h ",nminutes, "m ",nseconds,"s :",nsnp.kept,"SNPs found\n")
  }
  ## bind the blocks once (a leading empty matrix guarantees a well formed result even if no SNP was retained)
  data.Y <- do.call(rbind,c(list(matrix(0,0,npools)),blocks.Y))
  data.N <- do.call(rbind,c(list(matrix(0,0,npools)),blocks.N))
  snpdet <- do.call(rbind,c(list(matrix(character(0),0,4)),blocks.snpdet))
  rm(blocks.Y,blocks.N,blocks.snpdet)

  res<-new("pooldata")
  res@npools <- npools
  res@nsnp <- nrow(data.Y)
  res@refallele.readcount <- data.Y ; rm(data.Y)
  res@readcoverage <- data.N ; rm(data.N)
  res@snp.info <- snpdet #columns: chromosome, position, reference allele, alternate allele
  rm(snpdet)
  res@poolsizes <- poolsizes
  res@poolnames <- poolnames

  cat("Data consists of",res@nsnp,"SNPs for",res@npools,"Pools\n")
  return(res)
}