How to add multiple calculated columns to a SparkDataFrame using SparkR?

I'm now stuck on a rather basic case, but I can't find a clean solution with SparkR...
From the N columns of my SparkDataFrame, I need to create N new calculated columns.
df <- data.frame(V1 = base::sample(1:10, 5),
                 V2 = base::sample(1:10, 5),
                 V3 = base::sample(1:10, 5),
                 V4 = base::sample(1:10, 5),
                 V5 = base::sample(1:10, 5),
                 X  = runif(n = 5, min = 0, max = 5))
sdf <- createDataFrame(df)
sdf <- withColumn(sdf, "V1_X", column("X") / column("V1"))
sdf <- withColumn(sdf, "V2_X", column("X") / column("V2"))
sdf <- withColumn(sdf, "V3_X", column("X") / column("V3"))
sdf <- withColumn(sdf, "V4_X", column("X") / column("V4"))
sdf <- withColumn(sdf, "V5_X", column("X") / column("V5"))
Basically, I want to apply a function to my vector/list of column names. That's easy in plain R; in SparkR I can lapply a function over the columns, but I end up modifying the original columns instead of adding new ones. Something escapes me!
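For illustration, the plain-R equivalent I have in mind (just a sketch based on the reprex above) would be:
# divide X by each of V1..V5 and store the results as new columns
ratio_cols <- setdiff(names(df), "X")
df[paste0(ratio_cols, "_X")] <- lapply(df[ratio_cols], function(v) df$X / v)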
Thanks!

Maybe you can consider the following approach:
library(SparkR)

df <- data.frame(V1 = base::sample(1:10, 5),
                 V2 = base::sample(1:10, 5),
                 V3 = base::sample(1:10, 5),
                 V4 = base::sample(1:10, 5),
                 V5 = base::sample(1:10, 5),
                 X  = runif(n = 5, min = 0, max = 5))
sdf <- createDataFrame(df)

col_Names <- colnames(sdf)
nb_Col_Ratio <- length(col_Names) - 1  # every column except X, which comes last
for (i in 1:nb_Col_Ratio) {
  new_Col_Name <- paste(col_Names[i], "X", sep = "_")  # "V1_X", "V2_X", ...
  sdf <- withColumn(sdf, new_Col_Name, column("X") / column(col_Names[i]))
}
print(as.data.frame(sdf))
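Note that the loop relies on X being the last column. A variant that picks the ratio columns by name (a sketch under the same reprex, run on the freshly created sdf) avoids depending on column order:
# build one ratio column for every column except "X"
for (c in setdiff(colnames(sdf), "X")) {
  sdf <- withColumn(sdf, paste0(c, "_X"), column("X") / column(c))
}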

I usually prefer doing this kind of task with a single SparkR::select call instead of looping over multiple withColumn statements: every withColumn call adds another projection to the query plan, whereas select builds the whole projection at once. (The same logic can be used in Python via a list comprehension.) See the modified version of your code below:
df <- data.frame(V1 = base::sample(1:10, 5),
                 V2 = base::sample(1:10, 5),
                 V3 = base::sample(1:10, 5),
                 V4 = base::sample(1:10, 5),
                 V5 = base::sample(1:10, 5),
                 X  = runif(n = 5, min = 0, max = 5))
sdf <- SparkR::createDataFrame(df)
# sdf <- SparkR::withColumn(sdf, "V1_X", SparkR::column("X") / SparkR::column("V1"))
# sdf <- SparkR::withColumn(sdf, "V2_X", SparkR::column("X") / SparkR::column("V2"))
# sdf <- SparkR::withColumn(sdf, "V3_X", SparkR::column("X") / SparkR::column("V3"))
# sdf <- SparkR::withColumn(sdf, "V4_X", SparkR::column("X") / SparkR::column("V4"))
# sdf <- SparkR::withColumn(sdf, "V5_X", SparkR::column("X") / SparkR::column("V5"))
loop_on_cols <- SparkR::colnames(sdf)[SparkR::colnames(sdf) != "X"]
sdf2 <- SparkR::select(
  sdf,
  c(
    SparkR::colnames(sdf),
    lapply(
      loop_on_cols,
      function(c) {
        SparkR::alias(SparkR::column("X") / SparkR::column(c), paste0(c, "_X"))
      }
    )
  )
)
SparkR::head(sdf2)
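A quick sanity check on the result (plain SparkR calls, nothing beyond the code above):
SparkR::printSchema(sdf2)  # the five new *_X columns appear after the originals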

Related

How to drop all NA columns in a SparkDataFrame with SparkR?

Once again, I'm facing a problem that I can't transcribe into SparkR.
I have a SparkDataFrame in which some columns contain only NAs, and I want to drop all of those columns.
I discovered SparkR recently and I'm far from understanding all of its workings, but it's very frustrating to get stuck on a point that isn't so complicated...
Here is the reprex and the way I would do it in R:
library(data.table)

df <- data.frame(V1 = base::sample(1:10, 5),
                 V2 = base::rep(NA, 5),
                 V3 = base::sample(1:10, 5),
                 V4 = base::rep(NA, 5),
                 V5 = base::rep(NA, 5),
                 X  = runif(n = 5, min = 0, max = 5))
sdf <- createDataFrame(df)

# the data.table way: flag the all-NA columns, then delete them by reference
dt <- setDT(df)
na.lst <- sapply(dt, function(x) all(is.na(x)))
dt[, which(na.lst) := NULL]
Thanks!
You can consider the following approach:
library(SparkR)

df <- data.frame(V1 = base::sample(1:10, 5),
                 V2 = base::rep(NA, 5),
                 V3 = base::sample(1:10, 5),
                 V4 = base::rep(NA, 5),
                 V5 = base::rep(NA, 5),
                 X  = runif(n = 5, min = 0, max = 5))
sdf <- createDataFrame(df)

col_Names <- colnames(sdf)
nb_Col_Names <- length(col_Names)
vec_Bool <- rep(FALSE, nb_Col_Names)
for (i in 1:nb_Col_Names) {
  # dropna(how = "all") drops the rows of the single selected column that are
  # NA; if no rows survive, the column contained nothing but NAs
  dim_Temp <- dim(dropna(select(sdf, col = col_Names[i]), how = "all"))
  if (dim_Temp[1] != 0) vec_Bool[i] <- TRUE
}
col <- col_Names[vec_Bool]
newdf <- select(sdf, col = col)
as.data.frame(newdf)
V1 V3 X
1 6 1 2.286716
2 10 3 3.532843
3 2 9 2.030851
4 8 6 3.304420
5 4 10 1.596272
See Remove columns with only NA values with SparkR
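The loop above launches one Spark job per column. A possible alternative, sketched here with standard SparkR column functions (isNotNull, cast, sum and alias), counts the non-null values of every column in a single aggregation and keeps the columns whose count is positive:
# one aggregation: non-null count per column, each named after its column
counts <- collect(select(sdf, lapply(colnames(sdf), function(c) {
  alias(sum(cast(isNotNull(column(c)), "integer")), c)
})))
keep  <- colnames(counts)[unlist(counts[1, ]) > 0]
newdf <- select(sdf, keep)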

Combine variables into numeric vector and find distance between them

I have four numeric variables that I would like to combine into two vectors, and then take the distance between those vectors.
df = data.frame(V1 = 1:10,
                V2 = 11:20,
                V3 = 21:30,
                V4 = 31:40)
I can create the vectors this way:
df2 <- df %>%
  mutate(vector1 = mapply(c, V1, V2, SIMPLIFY = F),
         vector2 = mapply(c, V3, V4, SIMPLIFY = F))
But I haven't been able to force them to be numeric so I can't calculate the distance between them:
# want to be able to do something like this
df2 %>%
  mutate(distance = sqrt(sum((vector1 - vector2) ^ 2)))
I've tried all sorts of combinations of:
distance_df$vector1 <- lapply(distance_df$vector1, as.numeric)
distance_df$vector1 <- as.numeric(as.character(distance_df$vector1))
I must be missing something quite obvious since this doesn't seem that difficult.
Might this be an option? The mapply(..., SIMPLIFY = F) calls create list-columns, which don't support vectorized arithmetic; rowwise() works with plain vectors instead.
library(tidyverse)

df = data.frame(V1 = 1:10,
                V2 = 11:20,
                V3 = 21:30,
                V4 = 31:40)

df %>%
  rowwise() %>%
  mutate(distance = sqrt(sum((c(V1, V2) - c(V3, V4)) ^ 2)))
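Since each per-row vector here has length two, the rowwise() step can also be replaced by ordinary vectorized arithmetic, which gives the same distances without row grouping:
df %>%
  mutate(distance = sqrt((V1 - V3)^2 + (V2 - V4)^2))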

R Loop code over several lists of dataframes

I have several lists of dataframes, and I want to convert the Date column in every single dataframe across all of those lists. Here is some example code:
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(2,20, length = 5)
v3 = seq(-2,7, length = 5)
v4 = seq(-6,3, length = 5)
df1 = data.frame(Date = v1, df1_Tmax = v2, df1_Tmean = v3, df1_Tmin = v4)
dfl1 <- list(df1, df1, df1, df1)
names(dfl1) = c("ABC_1", "DEF_1", "GHI_1", "JKL_1")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(3,21, length = 5)
v3 = seq(-3,8, length = 5)
v4 = seq(-7,4, length = 5)
df2 = data.frame(Date = v1, df2_Tmax = v2, df2_Tmean = v3, df2_Tmin = v4)
dfl2 <- list(df2, df2, df2, df2)
names(dfl2) = c("ABC_2", "DEF_2", "GHI_2", "JKL_2")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(4,22, length = 5)
v3 = seq(-4,9, length = 5)
v4 = seq(-8,5, length = 5)
df3 = data.frame(Date = v1, df3_Tmax = v2, df3_Tmean = v3, df3_Tmin = v4)
dfl3 <- list(df3, df3, df3, df3)
names(dfl3) = c("ABC_3", "DEF_3", "GHI_3", "JKL_3")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(2,20, length = 5)
v3 = seq(-2,8, length = 5)
v4 = seq(-6,3, length = 5)
abc = data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)
abclist <-list(abc, abc, abc, abc)
names(abclist) = c("ABC_abc", "DEF_abc", "GHI_abc", "JKL_abc")
I know how to change the date-column manually:
dfl1$ABC_1$Date = as.Date(dfl1$ABC_1$Date,format="%Y-%m-%d")
class(dfl1$ABC_1$Date)
But how can I do that for each single Date-Column in all of my lists of dataframes?
Here is one option using get and assign:
nms <- c('dfl1', 'dfl2', 'dfl3', 'abclist')
lapply(nms, function(x) assign(x,
                               lapply(get(x), function(y) {
                                 y$Date1 <- as.Date(y$Date, format = "%Y-%m-%d")
                                 return(y)
                               }),
                               envir = .GlobalEnv))
PS: Be careful with assign since it will change your global environment .GlobalEnv. Many R users will suggest the list solution over assign.
This can be done with lapply:
lapply(dfl1, function(x) {
  x$Date <- as.Date(x$Date, format = "%Y-%m-%d")
  return(x)
})
If you want to do this for all of your df-lists, store them in a list and convert every dataframe with a nested lapply:
df_list <- list(dfl1, dfl2, dfl3, abclist)
lapply(df_list, function(x) lapply(x, function(y) {
  y$Date <- as.Date(y$Date, format = "%Y-%m-%d")
  return(y)
}))
This assumes that the Date column always has the same name, "Date".
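Note that lapply returns new lists rather than modifying dfl1 and friends in place. A small sketch (assuming you want the converted lists back under their original names) writes the results into the workspace with base R's list2env:
converted <- lapply(list(dfl1 = dfl1, dfl2 = dfl2, dfl3 = dfl3, abclist = abclist),
                    function(x) lapply(x, function(y) {
                      y$Date <- as.Date(y$Date, format = "%Y-%m-%d")
                      y
                    }))
list2env(converted, envir = .GlobalEnv)  # restores dfl1, dfl2, dfl3, abclist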

Correlations between dataframe and list of dataframes in R

I want to calculate correlations between a dataframe and a list of dataframes. Here is my sample:
library(lubridate)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(2,20, length = 10)
v3 = seq(-2,7, length = 10)
v4 = seq(-6,3, length = 10)
df1 = data.frame(Date = v1, Tmax = v2, Tmean = v3, Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(3,21, length = 10)
v3 = seq(-3,8, length = 10)
v4 = seq(-7,4, length = 10)
abc = data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(4,22, length = 10)
v3 = seq(-4,9, length = 10)
v4 = seq(-8,5, length = 10)
def = data.frame(Date = v1, DEF_Tmax = v2, DEF_Tmean = v3, DEF_Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(2,20, length = 10)
v3 = seq(-2,8, length = 10)
v4 = seq(-6,3, length = 10)
ghi = data.frame(Date = v1, GHI_Tmax = v2, GHI_Tmean = v3, GHI_Tmin = v4)
df2 <-list(abc, def, ghi)
names(df2) = c("ABC", "DEF", "GHI")
I want all the correlation coefficients between df1 and df2, but only column-wise.
For example:
df1$Tmax and all df2*Tmax columns
df1$Tmean and all df2*Tmean columns
df1$Tmin and all df2*Tmin columns
I know that I can access all Tmax columns like that:
lapply(df2, "[[", 2)
I know how to calculate the correlation between two individual columns:
cor.test(df1$Tmax, df2$ABC$ABC_Tmax, method = "spearman")
But how can I do it for all columns at once? I tried this, which is not working:
cor.test(df1$Tmax, lapply(df2, "[[", 2), method = "spearman")
Any ideas?
You could use lapply in combination with mapply to apply cor.test and extract specific values from each test. (Your last attempt fails because cor.test expects two numeric vectors, while lapply returns a list.) For example, to get p.value and estimate we can do:
lapply(2:4, function(i) mapply(function(x, y) {
  a <- cor.test(x, y, method = "spearman")
  c(setNames(a$p.value, "pvalue"), a$estimate)
}, lapply(df2, "[[", i), df1[i]))
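As a small convenience (assuming the Tmax/Tmean/Tmin column order is the same in df1 and in each element of df2, as in the reprex), you can label the results by the df1 column they correspond to:
res <- lapply(2:4, function(i) mapply(function(x, y) {
  a <- cor.test(x, y, method = "spearman")
  c(setNames(a$p.value, "pvalue"), a$estimate)
}, lapply(df2, "[[", i), df1[i]))
names(res) <- colnames(df1)[2:4]
res$Tmax  # p-value and rho for ABC, DEF and GHI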

function returns relating columns with the corresponding column removed

This is my code so far:
library(dplyr)
record <- function(input, string){
  filter(input, input$race == string |
           input$flag == string)
}
Please help
You could try which. Using the data from #RuiBarradas:
set.seed(1234)
recordings <- data.frame(V1 = sample(LETTERS, 10),
                         V2 = sample(LETTERS, 10),
                         V3 = letters[1:10], stringsAsFactors = FALSE)

records <- function(recordings, string){
  # locate every cell equal to string, keep those rows, drop those columns
  hits <- which(recordings == string, arr.ind = TRUE)
  rws <- hits[, 1]
  cls <- hits[, 2]
  recordings[rws, -cls, drop = FALSE]
}
For A, it would return:
records(recordings, "A")
V2 V3
7 F g
For X:
records(recordings, "X")
V3
4 d
5 e
This assumes that no value is present in all columns.
If you only need to know the corresponding row values:
records <- function(recordings, string){
  which(recordings == string, arr.ind = TRUE)[, 1]
}
records(recordings, "X")
[1] 4 5
See if the following is what you want.
First I will make up a dataset, since you have not posted one.
set.seed(1234) # Make the results reproducible
recordings <- data.frame(V1 = sample(LETTERS, 10),
                         V2 = sample(letters, 10),
                         V3 = sample(4, 10, TRUE))
Now the function.
records <- function(DF, string){
  inx <- DF == string                                    # TRUE where a cell matches
  i <- apply(inx, 1, function(x) Reduce('||', x))        # rows with at least one match
  DF[i, which(colSums(!inx) == nrow(DF)), drop = FALSE]  # keep columns with no match
}
records(recordings, "A")
# V2 V3
#7 f 3
records(recordings, "x")
# V1 V3
#5 S 1
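For what it's worth, Reduce('||', x) over a logical vector is equivalent to any(x), so a more compact variant of the same idea (a sketch on the same data) would be:
records2 <- function(DF, string) {
  inx <- DF == string
  DF[apply(inx, 1, any), colSums(inx) == 0, drop = FALSE]
}
records2(recordings, "A")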
