data.table recode in selected columns

data.table recode in selected columns - r

So I'm struggling with data.table. How do I make v1 and v3 numeric?
dt = data.table(v1 = c('1','2','3'), v2 = c(1,2,3), v3 = c('1','2','3'))
dt[,c(1,3), with = F] = lapply(dt[,c(1,3), with = F], as.numeric)

Try this:
library(data.table)
dt <- data.table(v1 = c("1", "2", "3"), v2 = c(1, 2, 3), v3 = c("1", "2", "3"))
# Columns to recode. Extend this vector instead of writing one
# `col = as.numeric(col)` term per column.
cols <- c("v1", "v3")
# `:=` updates dt by reference. .SDcols restricts .SD to the chosen columns,
# and (cols) on the left-hand side assigns to the names stored in `cols`.
dt[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# Inspect the resulting column classes.
sapply(dt, class)

Related

How to drop all NA columns in a SparkDataFrame with SparkR?

Once again, I'm facing a problem that I can't transcribe under SparkR.
I have a SparkDataFrame in which some columns contain only NAs, and I want to delete all these columns.
I discovered SparkR only recently and I am far from understanding all of its workings, but it is very frustrating to be stuck on something that should not be so complicated...
Here is the reprex and the way I am doing it in R :
library(data.table)
df <- data.frame(V1 = base::sample(1:10,5), V2 = base::rep(NA,5), V3 = base::sample(1:10,5), V4 = base::rep(NA,5), V5 = base::rep(NA,5), X = runif(n = 5, min = 0, max = 5))
sdf <- createDataFrame(df)
dt <- setDT(df)
na.lst <- sapply(dt, function(x) all(is.na(x)))
dt[, which(na.lst) := NULL]
Thanks !

You can consider the following approach
library(SparkR)
# Example data: V2, V4 and V5 contain only NA.
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::rep(NA, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::rep(NA, 5),
  V5 = base::rep(NA, 5),
  X = runif(n = 5, min = 0, max = 5)
)
# NOTE(review): assumes a Spark session is already active - confirm.
sdf <- createDataFrame(df)
col_names <- colnames(sdf)
# Keep a column when at least one row survives dropna(), i.e. the column is
# not entirely NA. Iterating over the names with vapply() (instead of an
# index loop over 1:n) also yields an empty result when there are no columns.
keep <- vapply(
  col_names,
  function(nm) nrow(dropna(select(sdf, col = nm), how = "all")) != 0,
  logical(1)
)
# Select only the columns that hold at least one non-NA value.
newdf <- select(sdf, col = col_names[keep])
as.data.frame(newdf)
V1 V3 X
1 6 1 2.286716
2 10 3 3.532843
3 2 9 2.030851
4 8 6 3.304420
5 4 10 1.596272
See Remove columns with only NA values with SparkR

data.table, apply function to portion of a table

I want to apply function to portion of a table.
With data.frame, no problem:
df <- data.frame(name = paste("a", 1:10, sep = "-"),
x = 1:10,
y = rep(1:5),
z = rep(1:2, each = 5))
df[2:5, -1] <- scale(df[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
But data.table complains:
dt <- data.table(name = paste("a", 1:10, sep = "-"),
x = 1:10,
y = rep(1:5),
z = rep(1:2, each = 5))
dt[2:5, -1] <- scale(dt[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
Error in [<-.data.table(*tmp*, 2:5, -1, value = c(0.25, 0.5, 0.75, :
Item 1 of column numbers in j is -1 which is outside range [1,ncol=4]. Use column names
instead in j to add new columns.
What is the correct way in data.table? Thanks!

data.table needs more work to apply scale :
# Rescale rows 2:5 of every column except the first one, updating dt by reference.
library(data.table)
# Names of all columns but the first; these are the columns to rescale.
cols <- names(dt)[-1]
# Convert those columns to numeric first, presumably so that the fractional
# results of scaling can be stored in them.
dt[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# Map() pairs each column with its own center (1, 2, 3) and scale (4, 5, 6).
dt[2:5, (cols) := Map(scale, .SD, c(1,2,3), c(4,5,6)), .SDcols = cols]

Apply function to list of dataframes and columns matching pattern

I have a list of dataframes and I would like to apply a function to specific columns that follow a pattern across all the dataframes in the list.
Here is an example list of dataframes:
k_2 <- data.frame(Site = c(rep("A",3), rep("B",2)), V1 = c(1,2,3,4,5), V2 = c(1,2,3,4,5))
k_3 <- data.frame(Site = c(rep("A",3), rep("B",2)), V1 = c(1,2,3,4,5), V2 = c(1,2,3,4,5), V3 = c(1,2,3,4,5))
k_4 <- data.frame(Site = c(rep("A",3), rep("B",2)), V1 = c(1,2,3,4,5), V2 = c(1,2,3,4,5), V3 = c(1,2,3,4,5), V4 = c(1,2,3,4,5))
my.list <- list(k_2, k_3, k_4)
my.list
I want to apply this
k2_res <- ddply(k_2, "Site", function(x) colSums(x[c("V1", "V2")])/nrow(x))
to all the dataframes in the list. However, for k_3 the calculation will need to be colSums(x[c("V1","V2","V3")]), k_4 will go up to V4, and so on.
Ideas
I thought that maybe I could use some sort of grep or regex to automatically select all the columns beginning with V?

Are you looking for something like below?
# For every data frame in the list, compute per Site the column sums of the
# V<number> columns divided by the number of rows in that Site group.
lapply(
  my.list,
  function(d) {
    # Anchored pattern: only names that are exactly "V" followed by digits,
    # so a column that merely contains such a substring is not picked up.
    v_cols <- grepl("^V\\d+$", names(d))
    ddply(d, "Site", function(x) colSums(x[v_cols]) / nrow(x))
  }
)

R Loop code over several lists of dataframes

I have several lists of dataframes and I want to format the date in each single dataframe within all lists of dataframes. Here is an example code:
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(2,20, length = 5)
v3 = seq(-2,7, length = 5)
v4 = seq(-6,3, length = 5)
df1 = data.frame(Date = v1, df1_Tmax = v2, df1_Tmean = v3, df1_Tmin = v4)
dfl1 <- list(df1, df1, df1, df1)
names(dfl1) = c("ABC_1", "DEF_1", "GHI_1", "JKL_1")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(3,21, length = 5)
v3 = seq(-3,8, length = 5)
v4 = seq(-7,4, length = 5)
df2 = data.frame(Date = v1, df2_Tmax = v2, df2_Tmean = v3, df2_Tmin = v4)
dfl2 <- list(df2, df2, df2, df2)
names(dfl2) = c("ABC_2", "DEF_2", "GHI_2", "JKL_2")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(4,22, length = 5)
v3 = seq(-4,9, length = 5)
v4 = seq(-8,5, length = 5)
df3 = data.frame(Date = v1, df3_Tmax = v2, df3_Tmean = v3, df3_Tmin = v4)
dfl3 <- list(df3, df3, df3, df3)
names(dfl3) = c("ABC_3", "DEF_3", "GHI_3", "JKL_3")
v1 = c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")
v2 = seq(2,20, length = 5)
v3 = seq(-2,8, length = 5)
v4 = seq(-6,3, length = 5)
abc = data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)
abclist <-list(abc, abc, abc, abc)
names(abclist) = c("ABC_abc", "DEF_abc", "GHI_abc", "JKL_abc")
I know how to change the date-column manually:
dfl1$ABC_1$Date = as.Date(dfl1$ABC_1$Date,format="%Y-%m-%d")
class(dfl1$ABC_1$Date)
But how can I do that for each single Date-Column in all of my lists of dataframes?

Here is one option using get and assign
nms <- c("dfl1", "dfl2", "dfl3", "abclist")
# For each list named in nms: fetch it, convert the Date column of every
# data frame it holds, and write the converted list back under the same name.
for (nm in nms) {
  converted <- lapply(get(nm, envir = .GlobalEnv), function(y) {
    # Overwrite Date itself rather than adding a second column.
    y$Date <- as.Date(y$Date, format = "%Y-%m-%d")
    y
  })
  assign(nm, converted, envir = .GlobalEnv)
}
PS: Be careful with assign since it will change your global environment .GlobalEnv. Many R users will suggest the list solution over assign.

This can be done with lapply:
# Return a copy of dfl1 in which the Date column of each data frame has been
# parsed with as.Date().
lapply(dfl1, function(d) {
  d[["Date"]] <- as.Date(d[["Date"]], format = "%Y-%m-%d")
  d
})
If you want to do this for all of your df-lists, you need to store them in a list and then you can use a slightly modified version of the above call:
df_list <- list(dfl1, dfl2, dfl3, abclist)
# The outer lapply walks the lists and the inner lapply walks the data frames
# inside each list, so every data frame is converted, not only the first
# element of each list. lapply returns a new object; assign the result if the
# converted lists are needed afterwards.
lapply(df_list, function(dfs) {
  lapply(dfs, function(x) {
    x$Date <- as.Date(x$Date, format = "%Y-%m-%d")
    x
  })
})
This assumes that the Date-column has always the same name "Date".

Correlations between dataframe and list of dataframes in R

I want to calculate correlations between a dataframe and a list of dataframes. Here is my sample:
library(lubridate)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(2,20, length = 10)
v3 = seq(-2,7, length = 10)
v4 = seq(-6,3, length = 10)
df1 = data.frame(Date = v1, Tmax = v2, Tmean = v3, Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(3,21, length = 10)
v3 = seq(-3,8, length = 10)
v4 = seq(-7,4, length = 10)
abc = data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(4,22, length = 10)
v3 = seq(-4,9, length = 10)
v4 = seq(-8,5, length = 10)
def = data.frame(Date = v1, DEF_Tmax = v2, DEF_Tmean = v3, DEF_Tmin = v4)
v1 = seq(ymd('2000-05-01'),ymd('2000-05-10'),by='day')
v2 = seq(2,20, length = 10)
v3 = seq(-2,8, length = 10)
v4 = seq(-6,3, length = 10)
ghi = data.frame(Date = v1, GHI_Tmax = v2, GHI_Tmean = v3, GHI_Tmin = v4)
df2 <-list(abc, def, ghi)
names(df2) = c("ABC", "DEF", "GHI")
I want to have all correlation coefficients between df1 and df2, but only column-wise.
For example:
df1$Tmax and all df2*Tmax columns
df1$Tmean and all df2*Tmean columns
df1$Tmin and all df2*Tmin columns
I know that I can access all Tmax columns like that:
lapply(df2, "[[", 2)
I know how to calculate the correlation between 2 single values:
cor.test(df1$Tmax, df2$ABC$ABC_Tmax, method = "spearman")
But how can I do it for all columns at once? I tried this, which is not working:
cor.test(df1$Tmax, lapply(df2, "[[", 2), method = "spearman")
Any ideas?

You could use lapply in combination with mapply to apply cor.test and extract a specific value from the test. For example, to get p.value and estimate we can do
# For each value column i (2 = Tmax, 3 = Tmean, 4 = Tmin; column 1 is Date),
# test df1's column against the matching column of every data frame in df2.
lapply(2:4, function(i) mapply(function(x, y) {
# x: column i of one df2 element; y: column i of df1
a <- cor.test(x, y, method = "spearman")
# return the p-value (named "pvalue") followed by the correlation estimate
c(setNames(a$p.value, "pvalue"), a$estimate)
# df1[i] is a one-column data frame, so mapply reuses it for each df2 element
}, lapply(df2, "[[", i), df1[i]))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

data.table recode in selected columns - r

So I'm struggling with data.table. How do I make v1 and v3 numeric? dt = data.table(v1 = c('1','2','3'), v2 = c(1,2,3), v3 = c('1','2','3')) dt[,c(1,3), with = F] = lapply(dt[,c(1,3), with = F], as.numeric)

Try this: dt <- data.table(v1 = c('1','2','3'), v2 = c(1,2,3), v3 = c('1','2','3')) dt[,':='(v1=as.numeric(v1),v3=as.numeric(v3))] sapply(dt,class)

Related

How to drop all NA columns in a SparkDataFrame with SparkR?

data.table, apply function to portion of a table

Apply function to list of dataframes and columns matching pattern

R Loop code over several lists of dataframes

Correlations between dataframe and list of dataframes in R

Categories

Resources