# these functions/scripts have the following purpose (output files):
# mkfaa.sh - combine gzipped sequence files into one big FASTA file (refseq45.faa)
# gencat.sh - extract gi number, taxid, sequence length from RefSeq release catalog (gi.taxid.txt)
# protein.refseq.R - get average amino acid composition of each taxid from gzipped sequence files (protein_refseq.csv)

# bash scripts assume a GNU/Linux-like operating system
# timings were made for processing RefSeq 45 (2011-01-28) 
# using R-2.12.0, Linux x86_64, 1.4GHz Intel Core2Duo

## get the list of files and entries in the database
#1. download 'release45.files.installed' and 'RefSeq-release45.catalog' from NCBI
#   (ftp://ftp.ncbi.nih.gov/refseq/release/release-catalog)

## download and combine the protein sequence files
#3. run the following command:
#     grep 'microbial.*\.protein\.faa' release45.files.installed | \
#       sed -e "s/^/ftp\:\/\/ftp.ncbi.nih.gov\/refseq\/release\/microbial\//g" > urllist
#4. download the files using 'wget -i urllist' [~2 hours]
#5. move the .gz files to a directory named 'protein'
#6. run 'ls protein/*.gz > filelist'
#7. use 'mkfaa.sh' to combine the sequences into a single file [~20 minutes]

## process the catalog and compute protein amino acid compositions
#8. use 'gencat.sh' to generate gi.taxid.txt 
#9. use refseq.protein() (see 'protein.refseq.R') to generate protein_refseq.csv  [~5.5 hours]


