#' @title Format \code{\link{meta.retrieval}} output
#' @description Process the output of \code{\link{meta.retrieval}} by
#' un-zipping downloaded files and renaming them for more convenient downstream data analysis.
#' @param x a vector containing file paths to the output files generated by \code{\link{meta.retrieval}}.
#' All files are expected to be stored in the same folder: the folder of the first element of \code{x} is processed.
#' @param gunzip a logical value indicating whether files should only be renamed (\code{gunzip = FALSE}) or renamed AND unzipped (\code{gunzip = TRUE}).
#' @author Hajk-Georg Drost
#' @details The output of \code{\link{meta.retrieval}} usually contains compressed sequence files
#' and a naming convention based on the database the respective file was retrieved from (e.g. \code{Saccharomyces_cerevisiae_cds_from_genomic_refseq.fna.gz}). 
#' This function helps to format the \code{\link{meta.retrieval}} output files by
#' \itemize{
#' \item 1) Automatically uncompress all sequence files in the \code{meta.retrieval} output folder
#' \item 2) Automatically rename files from e.g. \code{Saccharomyces_cerevisiae_cds_from_genomic_refseq.fna.gz} to \code{Scerevisiae.fa}.
#' This allows more convenient downstream analyses and visualizations.
#' }
#' Files whose names contain \code{doc_}, \code{md5checksum} or \code{documentation}, as well as
#' files that already carry the uncompressed file extension, are left untouched.
#' If the folder does not contain any compressed file with a known extension
#' (\code{faa}, \code{fna}, \code{gff}, \code{out}, \code{gff3}, \code{gtf}, \code{fa}), nothing is changed.
#' @return A character vector containing the file paths of the formatted files. If nothing
#' had to be done, the paths of all non-documentation files found in the folder are returned.
#' @seealso \code{\link{meta.retrieval}}
#' @examples
#' \dontrun{
#' # The easiest way to use 'clean.retrieval()' in combination with
#' # 'meta.retrieval()' is to use the pipe operator from the 'magrittr' package
#' library(magrittr)
#' meta.retrieval(kingdom = "vertebrate_mammalian", 
#'                db = "refseq", 
#'                type = "genome") %>% clean.retrieval()
#' }
#' @export

clean.retrieval <- function(x, gunzip = TRUE) {
    
    if (length(x) == 0)
        stop("Please provide at least one file path to a meta.retrieval() output file.", call. = FALSE)
    
    if (any(!file.exists(x)))
        stop("Some of the meta.retrieval() output files seem not to exist. Please provide valid file paths to meta.retrieval() output files.", call. = FALSE)
    
    if (gunzip) {
        message("Cleaning file names and unzipping files ...")
    } else {
        message("Cleaning file names ...")
    }
    
    # all files are processed relative to the folder of the first input path
    folder <- dirname(x)[1]
    folder_files <- list.files(folder)
    
    if (length(folder_files) == 0)
        stop("Unfortunately, your specified folder '", folder, "' does not include any files.", call. = FALSE)
    
    # Detect which kind of compressed file is stored in the folder.
    # The order matters: when several kinds are present the last match wins.
    known_ext <- c("faa", "fna", "gff", "out", "gff3", "gtf", "fa")
    file_ext <- NULL
    for (ext in known_ext) {
        if (any(stringr::str_detect(folder_files, paste0("[.]", ext, "[.]gz$"))))
            file_ext <- ext
    }
    
    # doc, md5checksum, and documentation files are never touched
    is_doc <- stringr::str_detect(folder_files, "doc_") |
        stringr::str_detect(folder_files, "md5checksum") |
        stringr::str_detect(folder_files, "documentation")
    
    # Logical masks (instead of negative indices) stay correct when nothing
    # has to be excluded. Without a recognised compressed file there is
    # nothing to unzip or rename.
    if (is.null(file_ext)) {
        input_files <- character(0)
    } else {
        is_unzipped <- stringr::str_detect(folder_files, paste0("[.]", file_ext, "$"))
        input_files <- folder_files[!(is_doc | is_unzipped)]
    }
    
    if (length(input_files) == 0) {
        message("It seems that nothing needs to be done. All files are unzipped.")
        return(file.path(folder, folder_files[!is_doc]))
    }
    
    # keep everything in front of the first dot as the base name of each file
    input_files_without_appendix <- vapply(
        stringr::str_split(input_files, "[.]"),
        function(parts) parts[1],
        character(1)
    )
    
    output_files <- paste0(tidy_name(input_files_without_appendix), ".", file_ext)
    
    # renamed-only files are still compressed and therefore keep the .gz suffix
    if (!gunzip)
        output_files <- paste0(output_files, ".gz")
    
    input_paths <- file.path(folder, input_files)
    output_paths <- file.path(folder, output_files)
    
    if (!all(file.exists(input_paths)))
        stop("Something went wrong during the cleaning process. Some input files seem not to exist.", call. = FALSE)
    
    if (gunzip) {
        for (i in seq_along(input_paths)) {
            message("Unzipping file '", input_files[i], "' ...")
            R.utils::gunzip(input_paths[i], destname = output_paths[i])
        }
    } else {
        # files that already carry their clean name do not need to be moved
        needs_rename <- input_paths != output_paths
        
        # file.rename() may silently overwrite existing files, so refuse to
        # rename when two inputs map to the same name or the target exists
        if (any(duplicated(output_paths)) || any(file.exists(output_paths[needs_rename])))
            stop("Something went wrong during the cleaning process. Some files cannot be renamed because the new file names already exist or are not unique.", call. = FALSE)
        
        if (any(needs_rename)) {
            renamed <- file.rename(input_paths[needs_rename], output_paths[needs_rename])
            if (!all(renamed))
                stop("Something went wrong during the cleaning process. Some files could not be renamed.", call. = FALSE)
        }
    }
    
    message("Finished formatting.")
    
    return(output_paths)
}
