get-started

Find non-linear formulas that fit your input data. You can systematically explore and memoize the possible formulas and their cross-validation performance, in a parallel and incremental fashion. Three interoperable search functions are available: exaustive.search, random.search and genetic.search.

The library is designed with massively parallel use in mind: that is why there is a base.filepath holding the results shared among all parallel processes, and a res.filepath holding local results only.

Periodically, the multiple res.filepath files will be joined together into the shared base.filepath.

Load the library and set-up the data

library(symbolicr)

# Two synthetic predictors, 100 uniform draws each.
x1 <- runif(n = 100, min = 2, max = 67)
x2 <- runif(n = 100, min = 0.01, max = 0.1)

# Response: a known non-linear function of the predictors plus a small
# Gaussian noise term (sd = 0.001).
y <- log10(x1^2 * x2) + rnorm(n = 100, mean = 0, sd = 0.001)

# Predictors gathered in one data frame; columns are named after the vectors.
X <- data.frame(x1, x2)

Define hyper-parameters

# Shared results file for formulas of length 1.
# NOTE(review): `type` is not defined in this guide -- presumably a
# dataset/run suffix set by the surrounding script; confirm.
l1.filepath <- paste0('regression/regression', type, '.exploration.l1.rData')

# saveRDS() does not create missing folders, so make sure the target exists.
dir.create(dirname(l1.filepath), recursive = TRUE, showWarnings = FALSE)

# Create the file only when it is missing: several parallel processes run
# this same script, and re-initialising the file unconditionally would wipe
# the results they have already shared.
if (!file.exists(l1.filepath)) {
  saveRDS(empty.sample(), l1.filepath)
}

# K and N are forwarded unchanged to the search functions
# (presumably cross-validation settings -- confirm in the package docs).
K <- 15
N <- 30
# Upper bound on formula length handed to genetic.search().
max.formula.len <- 2

Note that you can control the formula space and the algorithm employed just by setting three variables:

seed
formula.len
n.squares

Thus, you can run multiple independent search processes, either on your computer or server-side, with a command similar to this one:

R --no-echo --no-restore --file=find_f.R --args 3 2 1006

That will run a genetic search to find formulas of maximum length 3 and order up to 2.

You just need to get the variables from command-line

# Positional arguments following --args, in this order:
#   <formula.len> <n.squares> <seed>
options <- commandArgs(trailingOnly = TRUE)

# Each argument arrives as a string; a missing or non-numeric one becomes NA.
seed        <- as.integer(options[3])
n.squares   <- as.integer(options[2])
formula.len <- as.integer(options[1])

otherwise, you can hard-code them

# Fixed settings for runs that are not driven from the command line.
# The best.* variables start out equal to the chosen values.
n.squares <- 1
best.n.squares <- n.squares

formula.len <- 2
best.formula.len <- formula.len

seed <- 1010

Define rData file for checkpoint save

Used only in genetic and random search. Exhaustive search will append the missing formulas to the file pointed to by base.filepath.

# Local checkpoint file, set only for a positive seed. Its name encodes the
# formula length, the order and the seed of this run.
# NOTE(review): `out.folder` and `type` are not defined in this guide, and
# there is no separator between n.squares and 'seed.' -- confirm both.
if (seed > 0) {
  res.filepath <- file.path(
    out.folder,
    paste0(
      'regression', type,
      '.exploration.fl.', formula.len,
      '.ord.', n.squares,
      'seed.', seed,
      '.rData'
    )
  )
}

Define non-linear transformations

rdf is the full dataset
x is the variable to which we should apply the non-linearity
z is a list with min and absmin fields

# Named list of the non-linear transformations available to the search.
# Every entry is a function(rdf, x, z) where
#   rdf  is the full dataset,
#   x    is the variable the non-linearity is applied to,
#   z    is a list of which only z$min is read here.
# Each function returns the transformed values of x.
transformations <- list(
  # Plain base-10 logarithm, no shift applied.
  "log" = function(rdf, x, z) {
    log10(x)
  },
  # Base-10 logarithm of x shifted by 0.1 + |z$min|.
  "log10" = function(rdf, x, z) {
    log10(0.1 + abs(z$min) + x)
  },
  # Shifted base-10 logarithm of x plus an offset copy of rdf$x1.
  "log_fwhm_fcrn_p" = function(rdf, x, z) {
    # Offset rdf$x1 by the absolute value of its minimum (NAs ignored).
    pos_x1 <- abs(min(rdf$x1, na.rm = TRUE)) + rdf$x1
    log10(0.1 + abs(z$min) + x + pos_x1)
  },
  # Example of a custom function: shifted log10 of x on the rows where
  # rdf$x1 > 0.1, and 0 everywhere else.
  "my_log10" = function(rdf, x, z) {
    keep <- rdf$x1 > 0.1
    # FALSE -> NA, so that multiplying by the mask blanks those rows out.
    keep[!keep] <- NA
    masked.x <- keep * x
    l <- log10(0.1 + abs(z$min) + masked.x)
    # Masked rows (and any other NA/NaN result) are reported as 0.
    l[is.na(l)] <- 0
    l
  },
  # Reciprocal of x shifted by 0.1 + |z$min|.
  "inv" = function(rdf, x, z) {
    1 / (0.1 + abs(z$min) + x)
  },
  # Logistic function of x.
  "sigmoid" = function(rdf, x, z) {
    1 / (1 + exp(-x))
  },
  # Logistic function of -x.
  "invsigmoid" = function(rdf, x, z) {
    1 / (1 + exp(x))
  }
)

Start symbolic regression

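The seed value selects the algorithm: 0 runs the exhaustive search, any other value below 1000 runs the random search, and 1000 or above runs the genetic search.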

# Pick the search algorithm from `seed`:
#   seed == 0    -> exaustive.search()
#   seed <  1000 -> random.search()
#   otherwise    -> genetic.search()
# NOTE(review): regressors.df and l.Fn are not defined in this guide --
# presumably the predictors data frame (X) and the response (y) built in the
# set-up step; confirm before running.
if (seed == 0) {
  # Shared file holding every formula of the requested length.
  base.filepath <- paste0(
    'regression/regression', type, '.exploration.l', formula.len, '.rData'
  )
  res.new <- exaustive.search(
    regressors.df, l.Fn,
    n.squares = n.squares,
    formula.len = formula.len,
    K = K, N = N, seed = seed,
    transformations = transformations,
    custom.abs.mins = list(),
    glob.filepath = base.filepath,
    chunk.size = NULL, cv.norm = TRUE
  )
} else if (seed < 1000) {
  base.filepath <- paste0(
    'regression/regression', type, '.exploration.l', formula.len, '.rData'
  )
  # general "random" search to find good candidates
  new.sample.res <- random.search(
    regressors.df, l.Fn, n.squares, formula.len,
    maxiter = 10, K = K, N = N,
    transformations = transformations,
    glob.filepath = base.filepath,
    local.filepath = res.filepath,
    memoization = TRUE, cv.norm = TRUE
  )
} else {
  # genetic can change formula length, so it is given one shared file per
  # length.
  # NOTE(review): l2.filepath and l3.filepath are not defined in this guide --
  # presumably they follow the same naming pattern as l1.filepath; confirm.
  base.filepath <- c(
    l1.filepath,
    l2.filepath,
    l3.filepath#,
    #l4.filepath
  )

  # finetuning procedure using genetic algorithm, starting from these
  # candidate variable sets
  best.vars.l <- list(
    c('log.x1')
  )
  # NOTE(review): the length bound is max.formula.len from the
  # hyper-parameters, not formula.len; and seed is passed as NULL rather than
  # the `seed` variable -- confirm both are intended.
  best.finetuned <- genetic.search(
    regressors.df,
    l.Fn,
    n.squares = n.squares,
    max.formula.len = max.formula.len,
    maxiter = 100,
    transformations = transformations,
    glob.filepath = base.filepath,
    local.filepath = res.filepath,
    memoization = TRUE,
    pcrossover = 0.1,
    pmutation = 0.9,
    seed = NULL,
    keepBest = TRUE,
    K = K,
    N = N,
    popSize = 100,
    best.vars.l = best.vars.l,
    cv.norm = TRUE
  )
}