singularity pull shub://tin6150/DIOS_demonstration singularity exec DIOS_demonstration_latest.sif /opt/conda/bin/jupyter lab --allow-root -or- docker podman run -it --entrypoint=/opt/conda/bin/jupyter tin6150/r4envids lab --allow-root

setenv R_LIBS /path/to/user/R-libraries # setting R_LIBS will change both where R looks for lib to load and when installing libs library( rJava ); # see if it can load the R library named rJava (case sensitive) listing all installed libraries: Rscript -e 'library()' sI <- sessionInfo() sI system('env') # this is running the env command via the shell and display that info system('id -a; hostname') # in jupyter notebook, the system() command output to console # use combination of intern=TRUE and wrap with print() just to be sure print(system( 'uptime', intern=TRUE )) library(data.table) fread( "cat /proc/loadavg", check.names=TRUE ) ## run system command and read result into data frame

setting user's own library install dir. for installing CRAN libraries... You can have a R_LIBS under your home dir. If you setup R_LIBS to be a shared folder, then your student can use the same library that you (or I, not as root) install. mkdir -p ~/rlibs36 export R_LIBS=~/rlibs36 (or R_LIBS=/global/home/groups/ac_seasonal/rlibs36 ) I am assuming this for a new R 3.6, are libraries compatible with old version? For Perl I keep them in separate directories; python is a different beast) # setting R_LIBS will change both where R looks for lib to load and when installing libs Running this as user should install a package called diagram: Rscript --quiet -e 'install.packages("diagram", repos = "http://cran.us.r-project.org")' after install, you should see files added to ~/rlibs36 If the above don't work, start interactive R as user install.packages('diagram',lib='~/rlibs36') this page has a good write up about this issue: https://statistics.berkeley.edu/computing/R-packages

installing packages from cran: (see r4eta or metabolics container) scripted to run from shell: Rscript --quiet -e 'install.packages("ggplot2", repos = "http://cran.us.r-project.org")' Rscript --quiet -e 'install.packages(c("aCRM", "akima", "broom", "cluster", "clusterCrit"), repos = "http://cran.us.r-project.org")' # do NOT have TAB(s) inside the arg list for packages(...), as Rscript can't tolerate it when invoked from a bash script. Rscript --quiet --no-readline --slave ... # --quiet is same as --silent # --slave was to make R run as quietly as possible # --no-readline hopefully do less drawing on screen and make install log more readable... # there isn't a --no-verbose or --no-interactive install many packages in single command: https://stackoverflow.com/questions/29041423/how-to-install-multiple-packages install.packages(c("EIAdata", "gdata", "ggmap", "ggplot2")) # rest omitted have not tried yet: install.r EIAdata gdata ggmap ggplot2 # rest omitted again

capabilities() # see if support X11, etc system("id;pwd") # run OS commands setwd("dirname") # cd to dirname, most useful in interactive R session. source("myFuncList.R") # import function list from file .libPaths() # list path used to scan for R libs .libPaths( c( ./libPaths(), '/home/tin/R/x86_64-pc-linux-gnu-library/3.4' ) ) # add path to library rm(list=ls()) # remove all var in memory

Sys.setenv(TNS_ADMIN="/site/conf/TNS_ADMIN",ORACLE_HOME="/usr/prog/oracle-client/11.1.0.6",ORACLE_SID='') # The TNS_ADMIN may not matter if ORACLE_HOME has a network/admin with working conf for tns name resolution. # ORACLE_SID is very important! if set, the instance specifier in the dbConnect will be ignored, and db always connect to instance specified by ORACLE_SID !! library( 'ROracle' ) ora=Oracle() con=dbConnect(ora,user='user/pass@instance') summary(con) # verify connected to desired db) result=dbGetQuery(con, "select * from tab") # perform SQL query result # see result of query (display content of variable) result2=dbListTables( con ) # see list of tables for the connected schema

gc(verbose=TRUE) # explicitly call garbage collection ls() # list all vars object.size( get("myData") ) rm() rm(list=ls()) # remove all var in memory # probably useful in rstudio when playing, but not in actual program! sort( sapply( ls(), function(x) { object.size(get(x))} ),decreasing=TRUE ) # list all vars and their sizes vars are out of scope at the end of function, and gc can happen to them automatically bigmemory leave data on file. some packages are build using bigmemory file-backed big matrix -- https://ljdursi.github.io/beyond-single-core-R/#/132 data = read.big.matrix(...) csv is slow. HDF5 and netCDF recommended.ref: https://ljdursi.github.io/beyond-single-core-R/#/111

• function(arglist){expr}: function definition: do expr with list of arguments arglist • if(cond){expr1}else{expr2}: if-statement: if cond is true, then expr1, else expr2 • for(var in vec) {expr}: for-loop: the counter var runs through the vector vec and does expr each run • while(cond){expr}: while-loop: while cond is true, do expr each run • x[n]: the n-th element # R index starts at 1 • x[m:n]: the m-th to n-th element • x[c(k,m,n)]: specific elements • x[x>m & x<n]: elements between m and n • x$n: element of list or data frame named n • x[["n"]]: idem # [[ ]] in R ... • [i,j]: element at i-th row and jth column • [i, ]: row i • return: show variable on the screen (used in functions) • as.numeric or as.character: change class to number or character string • strptime: change class from character to date-time (POSIX) ?"%*%" # get help on topic, quote it if it has "funny" symbols %*% # matrix multiply %/% # ?? partitioning data into partition for shipping to different nodes %:% # compose or nest foreach objects - https://ljdursi.github.io/beyond-single-core-R/#/84 $>% # like unix pipe, used a lot by dplyr %% # modulus, ie reminder. eg 7%%3 is 1 2^4 and 2**4 both seems to be power, get 16. 2^4 notation maybe preferred.

NA is the token to indicate a value that is not availabe. Kinda like UNDEF. It is not a string, and it is not NULL. It is spelled exactly as NA , both caps. > foo=NA > foo [1] NA > bar=na Error: object 'na' not found • is.na: test if variable is NA R by default will not compute on dataset that contain NA. If you don’t mind about the missing data and want to compute the statistics anyway, you can add the argument na.rm=TRUE (Should I remove the NAs? Yes!). > max(j, na.rm=TRUE) [1] 2

• lm(v1∼v2): linear fit (regression line) between vector v1 on the y-axis and v2 on the x-axis • nls(v1∼a+b*v2, start=ls(a=1,b=0)): nonlinear fit. Should contain equation with variables (here v1 and v2 and parameters (here a and b) with starting values • coef: returns coefficients from a fit • summary: returns all results from a fit • sum: sum of a vector • mean: mean of a vector • sd: standard deviation of a vector • aggregate(x,by=ls(y),FUN="mean"): split data set x into subsets (defined by y) and computes means of the subsets. Result: a new vector. • approx: interpolate. Argument: vector with NAs. Result: vector without NAs.

- "UNM Stat 5101"

- CDF and PPF in Excel, R and Python (good graphical guide covering what the p, q, d, r functions cover.)

- SciPy stats function

prefix: p, q, d, r :: p for "probability", the cumulative distribution function (c.d.f.) F(x) = P(X <= x) Answer What is P(X < 27.4) when X has Normal distribution with mean 50 and standard deviation of 20 aka N(50,20). N often styled as cursive cap N. if variance (σ) is given, use the relation s.d. = sqrt(sigma) pnorm( 27.4, mean=50, sd=20, lower.tail=TRUE ) pnorm( 27.4, 50, 20 ) In R, optional param default to mean=0, sd=1, lower.tail=TRUE. Python: scipy.stats.norm.cdf( 27.4, loc=50, scale=20 ) Excel: norm.dist( 27.4, 50, 20, TRUE ) For calculation of the p-value in a right tailed test for a given z-score: 1 - pnorm( 27.4, 50, 20 ) pnorm(27.4, 50, 20, lower.tail=FALSE ) # lower.tail=FALSE -> upper/right tail Python: 1 - scipy.stats.norm.cdf( 27.4, 50, 20 ) scipy.stats.norm.sf( 27.4, 50, 20 ) # sf = 1 - cdf q for "quantile", the inverse c.d.f. - aka PPF(q) for probability ( 1 - α ) p = F(x) x = F-inv(p) F-inv is styled as F^-1. So given a number p between zero and one, qnorm looks up the p-th quantile of the normal distribution. Answer What is F-inv(0.95) when X has the N(100, 15^2) distribution? qnorm(0.95, 100, 15) qnorm(0.95, mean=100, sd=15, lower.tail=TRUE) qnorm(0.05, mean=100, sd=15, lower.tail=FALSE) # from the relation (1-alpha) + alpha = 1, here alpha=0.95 Python: scipy.stats.norm.ppf( 0.95, loc=100, scale=15 ) Excel: norm.inv( 0.95, 100, 15 ) . Excel has a norm.s.inv(0.05), but only take mean=0 sd=1. d for "density", the density function (p.f. or p.d.f.) r for "random", a random variable having the specified distribution Generating random numbers from a standard normal distribution N(miu=0,sigma=1) # N cursive, miu=mean of 0, sigma=standard deviation of 1 rnorm(n=1, mean=0, sd=1 ) Python: scipy.stats.norm.rvs( loc=0, scale=1, size=1, random_state=123 ) # random_state is optional, use seed for repeatable/determinsistic result If n or size > 1 get a vector. Excel: norm.s.inv( rand() ) pnorm, qnorm, dnorm, and rnorm. are the p-, q-, d-, r- fn for Normal Distribution pbinom qbinom dbinom rbinom - for binomial distribution pchisq qchisq dchisq rchisq - for Chi-Square scipy.stats.chi2.cdf() etc pexp qexp dexp rexp - for Exponential plogis qlogis dlogis rlogis - for Logistic plnorm qlnorm dlnorm rlnorm - for Log Normal ppois qpois dpois rpois - for Poisson pt qt dt rt - for Student t scipy.stats.t.cdf() etc punif qunif dunif runif - for Uniform Note that for continuous distribution, some of these fn are defined using integrals, and R doesn't do integrations! All function take option mean and standard deviation. If omitted, default mean=0, sd=1. pnorm(27.4, mean=50, sd=20) pnorm(27.4, 50, 20)

a = 5 a <- 5 # both = and <- can be used for assignment # But <- is prefered by the R crowd. it also screw with my html code here, so maybe R should go to .rst # there are some subtle differences, side effect, formula, etc. # see https://renkun.me/2014/01/28/difference-between-assignment-operators-in-r/ # overall, for better readability of R code, # it was suggested to only use <- for assignment and = for specifying named parameters. # But I was not convinced and will likely continue to use =

1D: vector: homegenoeus elements. c() list: heterogeneous elements. list() 2D: matrix: homegenoeus elements. matrix() dataframe: heterogeneous elements, named columns. think table of data/spreadsheet :-D. data.frame(), tidyverse 3D, multi-dimention: array: homogeneous elements. 1-indexed. array(), dim() attribute() structure() factor() to store categorical data -- ie predefined values

vec1 = c( 1, 3, 5, 7) # c = concatenate, paste together. vec2 = c( 2, 4, 6, 8) vec3 = seq(10,25,5) # aka: vec3 = seq(from=10,to=25,by=5) # result: [1] 10 15 20 25 sum(vec2) > mat1=matrix( data = c(1,2,3,4,5,6), ncol=3 ) # ncol define how many column the matrix has > mat1 [,1] [,2] [,3] [1,] 1 3 5 [2,] 2 4 6 > mat[2,2] [1] 4 matlb/octave matrix syntax is A LOT nicer :-D pay special attention to the sequence of the numbers. cbind # column bind to create matrix rbind # row bind

A 1D array is a vector.

A 2D array is a matrix.

3D and higher dim structure is rarer in stats.

dim() length() names() rownames() colnames() dimnames() t() - transpose c() generalizes to cbind() and rbind() for matrix abind() for arrays as.matrix() as.array() -- for conversions

coord = data.frame(x = runif(n.total), y = runif(n.total), sampled = 0) coord.sampled = coord[coord$sampled == 1,] # initial sites data.frame (coord) is like a table and $ separate the table name and the column name. Thus, the above eg has coord$x coord$y coord$sampled each of them is an array (representing the cell values across many rows of the table). coord$x[1] # table coord, column x, row 1 # R is 1-indexed. (unlike python 0-indexed) coord$x[2] # table coord, column x, row 2 > t = data.frame(x = c(11,12,14), y = c(19,20,21), z = c(10,9,7)) > t x y z 1 11 19 10 2 12 20 9 3 14 21 7 > mean(t$x) [1] 12.33333 typeof( t ) calss( t ) is.data.frame( t ) cbind(t, data.frame( z = 3:1 ) rbind(t, data.frame(...) )It’s a common mistake to try and create a data frame by cbind()ing vectors together. This doesn’t work because cbind() will create a matrix unless one of the arguments is already a data frame. Instead use data.frame() directly

Since a data frame is a list of vectors, it is possible for a data frame to have a column that is a list

List in a simple form is a vector. but it can be arbitrary number of object, each having different length. syntax looks like it has some nested structure, pay close attention. > L2=list(vec1,vec2) > L2 [[1]] [1] 1 3 5 7 [[2]] [1] 2 4 6 8 > L = list(one=1, two=c(1,2),five=seq(1, 4, length=5)) > L $one [1] 1 $two [1] 1 2 $five [1] 1.00 1.75 2.50 3.25 4.00 [ ] returns a list [[ ]] returns the object which is stored in list If it is a named list, then List$name or List[["name"]] will return same as List[[ ]] While List["name"] returns a list ref: https://stackoverflow.com/questions/32819539/proper-way-to-access-list-elements-in-r for list returned by ans = mccollect( list(jobs...) ) typically ans[[1]], ans[[2]] etc will contain the original answer desired.

R has this concept of formula, and, at least from a user perspective, is different than function. (Actually, a formula can be called 'f' and another function 'f' and they can coexist and be called correctly by the right context!)

For python programmer, forget about programming for a bit. Put on the stat thinking cap :)

Remember the flower classification example in data science class? stack overflow describes this construct:

as "Model Species depends on Sepal Length, Sepal Width, Petal Length and Petal Width"

The tilde (~) operator is often used for regression model.

The LHS of the ~ operator is the name of the model (Species).

The RHS are the variables that is fed into the model, with the + operator being overloaded (don't literary means add them, but to treat each as "columns" to find covariance). Most models use data frames, and each "variable" is a column vector of data.

A formula capture delays the evaluation of an expression so that you can later evaluate it with f_eval(). ie

f <- ~ 1 + 2 + 3The formula 'f' will be defined, but the RHS expression will not be calculated yet.

Only when

f_eval(f)is executed is the expression calculated.

Above was a one-sided formula.

Below is a two-sided formula:

g <- y ~ x + zlazyeval provides abstraction like:

f_rhs(g) # x + z f_lhs(g) # y

lm and glm are logistic regresion models. the form y ~ model is interpreted as a specification that the response y is modelled by a linear predictor specified symbolically by model. Such a model consists of a series of terms separated by + operators.

Model operators:

+ separate a series of terms that affect the model. : ?? * factor crossing: a*b interpreted as a+b+a:b ^ crossing to specific degree. eg (a+b)^2 for crossing twice ^in% terms on the left are nested within those on the right. - remove dependence on the specified series I() - To avoid this confusion, the function I() can be used to bracket those portions of a model formula where the operators are used in their arithmetic sense

- The lattice package uses them to specify the variables to plot.
- The ggplot2 package uses them to specify panels for plotting.
- The dplyr package uses them for non-standard evaulation.

x <- c(1:100) y <- seq(0.1,10,0.1) # all these 3 are the same: plot(y ~ I(x^3)) plot(I(x^3), y) plot(x^3, y) # no need to use I() function as ~ operator never used. # note how ~ syntatically swapped the order of the independent and dependent variables x and y # but this below is very different: # ^ is interpreted as model operator for cross reference. ( due to presence of ~ operator) ploy(y ~ x^3)StackOverflow more extensive discussion on formula Regression in Python usinr R-style formula seems to discuss use of ~ and regression formula in R.

import pandas as pd from statsmodels.formula.api import ols model = ols("MPG ~ Year", data=df) results = model.fit()

%>% is defined by magrittr, which is used by dplyr. It is the equivalent of unix shell pipe. iris %>% head() %>% summary() is equiv to summary(head(iris)) which one to use is a matter of preference.

apply(t(beaver1),1,max) ## apply function max to data, 1=row-wise) lapply() returns a list sapply() returns a vector instead of a list tapply() does grouping before running summary function on them by() ... think of GROUP_BY in SQL

geoR::lifkit ## likfit - Likelihood Based Parameter Estimation For Gaussian Random Fields ## https://www.rdocumentation.org/packages/geoR/versions/1.8-1/topics/likfit SimAnneal() kridge ~ raster::extract() makCluster registerDoParallel %dopar%

- https://towardsdatascience.com/from-r-vs-python-to-r-and-python-aa25db33ce17
- PypeR - R within Python
- pyRserve - connect to a remote R server
- rpy2 - runs embedded R in a Python process

- rPython
- SnakeCharmR - fork, more modern ver of rPython
- PythonInR
- reticulate

Tutorials

Chris Paciorek (UC Berkeley Stats Dept) Parallel Processing for Distributed Computing Cover R, Python, Matlab and C, as well as how to use them in a Slurm cluster.

Jonathan Dursi Beyond Single-Core R a slide deck on Parallelization in R. Summary slides:

Dursi's github repo has data for running his example, and I wrapped that with a Jupyter Notebook container so everyone can try using this Docker Container with Jupyter notebook server:

Start jupyter notebook web server (on specific port, eg 6997): docker run -p 6997:6997 -v "$PWD":/mnt -it --entrypoint=/opt/conda/bin/jupyter tin6150/beyond-single-core-r lab --allow-root --no-browser --port=6997 --ip=0.0.0.0 Point web browser to something like http://127.0.0.1:6997/ and paste the token URL link shown on the terminal console /mnt is a mount of the current dir (PWD) where you started the docker process, and files written there will persist after the container is terminated. (Other files inside the container are ephemeral!)

mcparallel (likely just called parallel in early days)

mccollect

mc.core

Ref: Beyond single core R

mc* are multi-core routines mcparallel is good for parallel tasks mclapply is good for data parallelism - easily replace lapply() mc.cores really should represent number of tasks, core can be oversubscribed often with little perf degradation

parallel::mclapply - multi-core lapply - https://nceas.github.io/oss-lessons/parallel-computing-in-r/parallel-computing-in-r.html instead of result = lapply(starts, fx) use result = mclapply(starts, fx, cores=4) mclapply gathers up the responses from each of these function calls, and returns a list of responses that is the same length as the list or vector of input data (one return per input item). starts is a variable holding data, eg rep(100, 40) fx is a custom function that lapply() runs pvec - for simple use case of applying func to each element of vector and return vector, pvec is easier to use # https://ljdursi.github.io/beyond-single-core-R/#/48 https://ljdursi.github.io/beyond-single-core-R/#/30 mcparallel # fork a task , run in background mccollect # wait and collect result. so kinda like MPI scatter/gather library(multicore) ## functionality replaced by library('parallel') fun1 = function() {Sys.sleep(10); 1} fun2 = function() {Sys.sleep(5); 2} f1 = parallel(fun1()) ## new fn name is mcparallel() f2 = parallel(fun2()) result = collect(list(f1, f2)) result[[1]] # would contain result of f1Linear Regression Model fitting, in series vs parallel

https://ljdursi.github.io/beyond-single-core-R/#/32 and slide 33

fit1 = lm(DISTANCE ~ AIR_TIME, data=jan2010)) fit2 = lm(ARR_DELAY ~ DEP_DELAY, data=jan2010)) -vs- parfits = function() { pfit1 = mcparallel(lm(DISTANCE ~ AIR_TIME, data=jan2010)) pfit2 = mcparallel(lm(ARR_DELAY ~ DEP_DELAY, data=jan2010)) mccollect( list(pfit1, pfit2) ) } parfits() Note that mcollect asked for the tasks output to be combined into a list. Some data structure maybe needed to extract more complex model result??domino data lab on caret/doMC

library(doMC) library(caret) # popular ML using foreach to parallelize CV-folds, etc. fit.lda.ser vs fit.lda.par

stopCluster()

doParallel()

registerDoSEQ()

snow

foreach

%do%

%dopar%

for is run for its side effect foreach returns result, side effects maybe gone (when using parallel backend). foreach should be tought of lapply, not for loop %do% is serial Parallel libs: doParallel, doMC, doMPI cl = parallel::makeCluster(4) cl = parallel::makeCluster(4, type = "FORK") # unix fork(), default? cl = parallel::makeCluster(4, type = "PSOCK") # Posix Sockets creates a brand new session in R, nothing inherited from parent. if do this, may as well use MPI cluster and really scale it. cl = parallel::makeForkCluster(4) use registerDoSEQ() if want to force foreach to run in serial much of the parallelization is done using fork() in unix (may not work in windows). Also, fork() allow access to vars/memory from parent, but result not returned at the end. Need to do some special collection/return if need to save those data. Thus, "side-effects" such as those produced by `for` vanishes in parallelization. SNOW - like PSOCK, entirely new process, so ned to explicitly copy data. but process can be on remote machine. https://ljdursi.github.io/beyond-single-core-R/#/55 clusterCall() clusterEvalQ() # easier to use for tasks

NumThread=15 library(doParallel) cl = makeCluster( NumThread, outfile="mkcluster.out" ) ## create a cluster object registerDoParallel(cl) foreach(i=1:3) %dopar% sqrt(i) # the %dopar% says to run the foreach loop in parallel for each item # all console output inside cluster object will go to outfile # there are eg combine=rbind to merge result stopCluster(cl) .combine=rbind # row-bind, ie, each result becomes a row .combine=cbind # col-bind, ie, each result becomes a col .combine=c # c() ? detectCores() ## return number of cores on machineRef:

- CRAN doParallel
- R Doc on doParallel
- Quick Intro to Parallel Computing in R
- Beyond Single-Core R (and github source at https://github.com/ljdursi/beyond-single-core-R

library(parallel) library(Rdsm) nrows = 7 cl = makePSOCKcluster(3) # form 3-process PSOCK (share-nothing) cluster init = mgrinit(cl) # initialize Rdsm mgrmakevar(cl,"m",nrows,nrows) # make a 7x7 shared matrix bar = makebarr(cl) clusterEvalQ(cl,myid = myinfo$id) clusterExport(cl,c("nrows")) dmy = clusterEvalQ(cl,myidxs = getidxs(nrows)) dmy = clusterEvalQ(cl, m[myidxs,1:nrows] = myid) dmy = clusterEvalQ(cl,"barr()") print(m[,])

ship data to other nodes of the cluster: cl = makeCluster(8) clusterExport(cl, "jan2010") cares = clusterApply(cl, rep(5,4), do.n.kmeans) stopCluster(cl) mcres = mclapply(rep(5,4), do.n.kmeans, mc.cores=4)https://ljdursi.github.io/beyond-single-core-R/#/59

# Running across machines: # 2 machine, each with 8 core of process? hosts = c( rep("localhost",8), rep("192.168.0.10", 8) ) cl = makePSOCKcluster(names=hosts) clusterCall(cl, rnorm, 5) clusterCall(cl, system, "hostname") stopCluster(cl)

scheduling viz of run library(snow,quiet=TRUE) do.kmeans.nclusters <- function(n) { kmeans(jan2010, centers=n, nstart=10) } cl = makeCluster(2) clusterExport(cl,"jan2010") tm = snow.time( clusterApply(cl, 1:6, do.kmeans.nclusters) ) plot(tm) # load balance, below is more like mc.preschedule=FALSE, kicking off task to worker only when needed tm.lb = snow.time(clusterApplyLB(cl, 1:6, do.kmeans.nclusters)) plot(tm.lb)https://ljdursi.github.io/beyond-single-core-R/#/73 - Summary for parallel/snow

foreach parallel .multicombine=TRUE %:% nest foreach object # slide 84MPI cluster

https://ljdursi.github.io/beyond-single-core-R/#/149 init() rank = comm.rank() ... allreduce(length(dta), op="sum") comm.print(data.min) finalize()

cl = makeCluster(2, type = "SOCK") clusterCall(cl, function() { library(doSNOW) registerDoSNOW(makeCluster(2, type = "SOCK")) NULL }) registerDoSNOW(cl)

[Doc URL:
http://tin6150.github.io/psg/R.html]

[Doc URL: http://tin6150.gitlab.io/psg/R.html]

Last Updated: 2020-10-28

(cc) Tin Ho. See main page for copyright info.

[Doc URL: http://tin6150.gitlab.io/psg/R.html]

Last Updated: 2020-10-28

(cc) Tin Ho. See main page for copyright info.

psg101 sn50 tin6150

hoti1

nSarCoV2

bofh1