# Read two line-aligned text files, pair up their non-empty lines, tokenize
# both sides and keep only the pairs whose token counts on BOTH sides lie in
# [minlen, maxlen].
#
# Arguments:
#   file1, file2   Descriptions passed to file() for the source-side and the
#                  target-side text.
#   nrec           Maximum number of lines read from each file (readLines'
#                  `n`); -1 reads everything.
#   encode.sorc,
#   encode.trgt    Passed as `encoding` to readLines for file1 / file2.
#   minlen, maxlen Inclusive bounds on the number of space-separated tokens a
#                  sentence must have, on both sides, for the pair to be kept.
#   all            Forwarded to culf() as `lower`.
#                  NOTE(review): presumably switches between lower-casing the
#                  whole string and only its beginning -- confirm against culf.
#   removePt       Forwarded to tokens() as `remove_punct`.
#   word_align     Selects the shape of the result (see below).
#
# Value:
#   word_align = TRUE : an unnamed list of
#       [[1]] the number of sentence pairs before tokenization and filtering,
#       [[2]] a two-column character matrix (source, target), one row per kept
#             pair, each cell holding the tokens joined by single spaces.
#             It is always a matrix, also when only one or no pair is kept.
#   word_align = FALSE: list(initial, used, sorc.tok, trgt.tok) where `used`
#       is the number of kept pairs and sorc.tok / trgt.tok are lists of
#       token vectors.
prepareData <-
function(file1, file2, nrec = -1, encode.sorc = 'unknown' , encode.trgt = 'unknown', minlen = 5, maxlen = 40, all = FALSE, removePt = TRUE, word_align = TRUE)
{
  # Read up to `nrec` lines; on.exit guarantees the connection is released
  # even when readLines fails.
  read_side <- function(path, enc) {
    con <- file(path)
    on.exit(close(con), add = TRUE)
    readLines(con, encoding = enc, n = nrec, warn = FALSE)
  }

  # Wherever `marker` has an empty line, prepend the matching line of `text`
  # to the following line of `text` and blank it, so that both sides end up
  # empty at that position and can be dropped together.
  shift_past_blanks <- function(marker, text) {
    n <- length(marker)
    for (k in seq_len(n)) {
      if (marker[k] == '') {
        # The last line has no successor to merge into. Indexing k + 1 there
        # would paste the literal text "NA" and grow the vector by one.
        if (k < n) text[k + 1] <- paste(text[k], text[k + 1])
        text[k] <- ''
      }
    }
    text
  }

  s_sen <- read_side(file1, encode.sorc)
  t_sen <- read_side(file2, encode.trgt)

  # Blank lines are only reconciled when both files have the same number of
  # lines. The order matters: the second pass sees the blanks created by the
  # first one.
  if (length(s_sen) == length(t_sen)) {
    t_sen <- shift_past_blanks(s_sen, t_sen)
    s_sen <- shift_past_blanks(t_sen, s_sen)
  }

  s_sen <- s_sen[nzchar(s_sen)]
  t_sen <- t_sen[nzchar(t_sen)]

  # NOTE(review): if the two sides still differ in length here, cbind recycles
  # the shorter one -- the pairing is then unreliable.
  aa <- cbind(s_sen, t_sen)
  len1 <- nrow(aa)

  #------------------------- Tokenization --------------------------

  aa[, 1] <- culf(aa[, 1], lower = all)
  aa[, 2] <- culf(aa[, 2], lower = all)

  # Free the raw sentences before tokenizing.
  rm(s_sen, t_sen)
  gc()

  # The matrix is tokenized in column-major order: the first half of the
  # result belongs to the source side, the second half to the target side.
  aa <- tokens(aa, remove_punct = removePt)
  n_tok <- length(aa) %/% 2L

  word2 <- aa[seq_len(n_tok)]
  word3 <- aa[n_tok + seq_len(n_tok)]

  # vapply (unlike sapply) yields character(0) for empty input, so cbind
  # always builds a character matrix.
  aa <- cbind(vapply(word2, paste, character(1), collapse = ' '),
              vapply(word3, paste, character(1), collapse = ' '))

  n_src <- lengths(strsplit(aa[, 1], ' '))
  n_trg <- lengths(strsplit(aa[, 2], ' '))
  keep <- n_src >= minlen & n_src <= maxlen & n_trg >= minlen & n_trg <= maxlen

  # drop = FALSE: a single surviving pair must stay a 1 x 2 matrix.
  aa <- aa[keep, , drop = FALSE]

  if (word_align) {
    return(list(len1, aa))
  }

  len2 <- length(aa) / 2
  aa <- strsplit(aa, ' ')

  # seq_len keeps both token lists empty when no pair survived the filter.
  list(initial = len1,
       used = len2,
       sorc.tok = aa[seq_len(len2)],
       trgt.tok = aa[len2 + seq_len(len2)])
}
