So I'm struggling with data.table. How do I make v1 and v3 numeric?
# Sample table: v1 and v3 hold digits as character strings, v2 is already numeric.
dt = data.table(v1 = c('1','2','3'), v2 = c(1,2,3), v3 = c('1','2','3'))
# Asker's attempt: run as.numeric over columns 1 and 3 and assign the result back by position.
dt[,c(1,3), with = F] = lapply(dt[,c(1,3), with = F], as.numeric)
Try this:
# Rebuild the sample table; v1 and v3 start out as character columns.
dt <- data.table(
  v1 = c('1', '2', '3'),
  v2 = c(1, 2, 3),
  v3 = c('1', '2', '3')
)
# Convert both character columns to numeric by reference in one := call.
char_cols <- c("v1", "v3")
dt[, (char_cols) := lapply(.SD, as.numeric), .SDcols = char_cols]
# Show the class of every column after the conversion.
sapply(dt, class)
Related
Once again, I'm facing a problem that I can't transcribe under SparkR.
I have a SparkDataFrame in which some columns contain only NAs, and I want to delete all of these columns.
I discovered SparkR only recently and I am still far from understanding how it all works, but it is very frustrating to be blocked on something that should not be so complicated...
Here is the reprex and the way I am doing it in R :
library(data.table)
# Example data: V2, V4 and V5 contain only NA; V1, V3 and X hold real values.
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::rep(NA, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::rep(NA, 5),
  V5 = base::rep(NA, 5),
  X = runif(n = 5, min = 0, max = 5)
)
# Spark copy of the same data (createDataFrame is the SparkR constructor).
sdf <- createDataFrame(df)
# Plain-R reference solution: setDT() turns df into a data.table by reference.
dt <- setDT(df)
# One TRUE/FALSE per column. vapply() always returns a logical vector, whereas
# sapply() returns a list for a zero-column input, which which() would reject.
na.lst <- vapply(dt, function(x) all(is.na(x)), logical(1))
# Remove the all-NA columns by reference.
dt[, which(na.lst) := NULL]
Thanks !
You can consider the following approach
library(SparkR)
# Example data: V2, V4 and V5 contain only NA.
df <- data.frame(
  V1 = base::sample(1:10, 5),
  V2 = base::rep(NA, 5),
  V3 = base::sample(1:10, 5),
  V4 = base::rep(NA, 5),
  V5 = base::rep(NA, 5),
  X = runif(n = 5, min = 0, max = 5)
)
sdf <- createDataFrame(df)
col_Names <- colnames(sdf)
nb_Col_Names <- length(col_Names)
# vec_Bool[i] becomes TRUE when column i has at least one non-NA value.
vec_Bool <- rep(FALSE, nb_Col_Names)
# seq_len() gives an empty loop for zero columns; 1:0 would iterate over c(1, 0).
for (i in seq_len(nb_Col_Names)) {
  # Keep only column i, drop its NA rows, and look at how many rows survive.
  dim_Temp <- dim(dropna(select(sdf, col = col_Names[i]), how = "all"))
  if (dim_Temp[1] != 0) {
    vec_Bool[i] <- TRUE
  }
}
# Names of the columns to keep, then the reduced SparkDataFrame.
col <- col_Names[vec_Bool]
newdf <- select(sdf, col = col)
# Collect the result locally to inspect it.
as.data.frame(newdf)
V1 V3 X
1 6 1 2.286716
2 10 3 3.532843
3 2 9 2.030851
4 8 6 3.304420
5 4 10 1.596272
See Remove columns with only NA values with SparkR
I want to apply a function to a portion of a table.
With data.frame, no problem:
# Ten labelled rows with three integer measurement columns.
row_ids <- 1:10
df <- data.frame(
  name = paste0("a-", row_ids),
  x = row_ids,
  y = rep(1:5, times = 2),
  z = rep(1:2, each = 5)
)
# Rescale rows 2-5 of every column except `name`, using one center/scale
# pair per column, and write the result back into the same cells.
target_rows <- 2:5
rescaled <- scale(df[target_rows, -1], center = c(1, 2, 3), scale = c(4, 5, 6))
df[target_rows, -1] <- rescaled
But data.table complains:
# Same table as the data.frame example, built as a data.table.
dt <- data.table(name = paste("a", 1:10, sep = "-"),
x = 1:10,
y = rep(1:5),
z = rep(1:2, each = 5))
# The sub-assignment that works on a data.frame; on a data.table it raises the error shown below.
dt[2:5, -1] <- scale(dt[2:5, -1], center = c(1,2,3), scale = c(4,5,6))
Error in [<-.data.table(*tmp*, 2:5, -1, value = c(0.25, 0.5, 0.75, :
Item 1 of column numbers in j is -1 which is outside range [1,ncol=4]. Use column names
instead in j to add new columns.
What is the correct way in data.table? Thanks!
data.table needs more work to apply scale :
library(data.table)
# Every column except the first one is to be rescaled.
cols <- names(dt)[-1]
# Convert the selected columns to numeric first
# (presumably so the integer columns can hold fractional results - TODO confirm).
dt[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
# For rows 2-5, Map() pairs each column with its own center (1, 2, 3) and
# scale (4, 5, 6) and calls scale() on it; the results are assigned by reference.
dt[2:5, (cols) := Map(scale, .SD, c(1,2,3), c(4,5,6)), .SDcols = cols]
I have a list of dataframes and I would like to apply a function to specific columns that follow a pattern across all the dataframes in the list.
Here is an example list of dataframes:
# Pieces shared by every example frame: a site label and the values 1-5.
site <- rep(c("A", "B"), times = c(3, 2))
vals <- c(1, 2, 3, 4, 5)
# Each frame has one more V column than the previous one.
k_2 <- data.frame(Site = site, V1 = vals, V2 = vals)
k_3 <- data.frame(Site = site, V1 = vals, V2 = vals, V3 = vals)
k_4 <- data.frame(Site = site, V1 = vals, V2 = vals, V3 = vals, V4 = vals)
# Collect the frames in a list and print it.
my.list <- list(k_2, k_3, k_4)
my.list
I want to apply this
# For each Site group in k_2: column sums of V1 and V2 divided by the group's row count.
k2_res <- ddply(k_2, "Site", function(x) colSums(x[c("V1", "V2")])/nrow(x))
to all the dataframes in the list. However, for k_3 the calculation will need to be colSums(x[c("V1","V2","V3")]), for k_4 it will go up to V4, and so on.
Ideas
I thought that maybe I could use some sort of grep or regex to automatically select all the columns beginning with V?
Are you looking for something like below?
# For every data frame in my.list, compute the per-Site mean of each V<number>
# column (column sums divided by the number of rows in the Site group).
lapply(
  my.list,
  function(d) {
    # Anchor the pattern at the start so only names that begin with "V"
    # followed by digits are picked; an unanchored "V\\d+" would also match
    # names that merely contain such a sequence.
    v_cols <- grepl("^V\\d+", names(d))
    ddply(d, "Site", function(x) colSums(x[v_cols]) / nrow(x))
  }
)
I have several lists of dataframes and I want to format the date in each single dataframe within all lists of dataframes. Here is an example code:
# Five consecutive days stored as character strings; every frame below shares them.
v1 <- c("2000-05-01", "2000-05-02", "2000-05-03", "2000-05-04", "2000-05-05")

# First list: four copies of df1, one per station.
v2 <- seq(2, 20, length.out = 5)
v3 <- seq(-2, 7, length.out = 5)
v4 <- seq(-6, 3, length.out = 5)
df1 <- data.frame(Date = v1, df1_Tmax = v2, df1_Tmean = v3, df1_Tmin = v4)
dfl1 <- setNames(rep(list(df1), 4), c("ABC_1", "DEF_1", "GHI_1", "JKL_1"))

# Second list: four copies of df2.
v2 <- seq(3, 21, length.out = 5)
v3 <- seq(-3, 8, length.out = 5)
v4 <- seq(-7, 4, length.out = 5)
df2 <- data.frame(Date = v1, df2_Tmax = v2, df2_Tmean = v3, df2_Tmin = v4)
dfl2 <- setNames(rep(list(df2), 4), c("ABC_2", "DEF_2", "GHI_2", "JKL_2"))

# Third list: four copies of df3.
v2 <- seq(4, 22, length.out = 5)
v3 <- seq(-4, 9, length.out = 5)
v4 <- seq(-8, 5, length.out = 5)
df3 <- data.frame(Date = v1, df3_Tmax = v2, df3_Tmean = v3, df3_Tmin = v4)
dfl3 <- setNames(rep(list(df3), 4), c("ABC_3", "DEF_3", "GHI_3", "JKL_3"))

# Fourth list: four copies of abc.
v2 <- seq(2, 20, length.out = 5)
v3 <- seq(-2, 8, length.out = 5)
v4 <- seq(-6, 3, length.out = 5)
abc <- data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)
abclist <- setNames(rep(list(abc), 4), c("ABC_abc", "DEF_abc", "GHI_abc", "JKL_abc"))
I know how to change the date-column manually:
# Manual conversion of one Date column, addressed by list element and column name.
dfl1[["ABC_1"]][["Date"]] <- as.Date(dfl1[["ABC_1"]][["Date"]], format = "%Y-%m-%d")
# Confirm the column's class after the conversion.
class(dfl1[["ABC_1"]][["Date"]])
But how can I do that for each single Date-Column in all of my lists of dataframes?
Here is one option using get and assign
# Names of the global lists whose data frames need their Date column converted.
nms <- c("dfl1", "dfl2", "dfl3", "abclist")
# A for loop is used because the work is a side effect (re-assigning globals);
# a top-level lapply() would also print every converted list.
for (nm in nms) {
  converted <- lapply(get(nm, envir = .GlobalEnv), function(y) {
    # Overwrite Date itself rather than adding a second column (was `Date1`).
    y$Date <- as.Date(y$Date, format = "%Y-%m-%d")
    y
  })
  # Replace the original list in the global environment with the converted one.
  assign(nm, converted, envir = .GlobalEnv)
}
PS: Be careful with assign since it will change your global environment .GlobalEnv. Many R users will suggest the list solution over assign.
This can be done with lapply:
# Return a copy of dfl1 in which every data frame's Date column has class Date.
lapply(dfl1, function(station_df) {
  station_df[["Date"]] <- as.Date(station_df[["Date"]], format = "%Y-%m-%d")
  station_df
})
If you want to do this for all of your df-lists, you need to store them in a list; then you can use a slightly modified version of the above call:
# Gather the four lists of data frames into one outer list.
df_list <- list(dfl1, dfl2, dfl3, abclist)
# Outer lapply walks the lists, inner lapply walks the data frames in each list,
# so every Date column is converted (not only the first data frame of each list).
lapply(df_list, function(station_list) {
  lapply(station_list, function(d) {
    d$Date <- as.Date(d$Date, format = "%Y-%m-%d")
    d
  })
})
This assumes that the date column always has the same name, "Date".
I want to calculate correlations between a dataframe and a list of dataframes. Here is my sample:
library(lubridate)
# Ten consecutive days shared by every data frame below.
v1 <- seq(ymd("2000-05-01"), ymd("2000-05-10"), by = "day")

# Reference series.
v2 <- seq(2, 20, length.out = 10)
v3 <- seq(-2, 7, length.out = 10)
v4 <- seq(-6, 3, length.out = 10)
df1 <- data.frame(Date = v1, Tmax = v2, Tmean = v3, Tmin = v4)

# Station ABC.
v2 <- seq(3, 21, length.out = 10)
v3 <- seq(-3, 8, length.out = 10)
v4 <- seq(-7, 4, length.out = 10)
abc <- data.frame(Date = v1, ABC_Tmax = v2, ABC_Tmean = v3, ABC_Tmin = v4)

# Station DEF.
v2 <- seq(4, 22, length.out = 10)
v3 <- seq(-4, 9, length.out = 10)
v4 <- seq(-8, 5, length.out = 10)
def <- data.frame(Date = v1, DEF_Tmax = v2, DEF_Tmean = v3, DEF_Tmin = v4)

# Station GHI.
v2 <- seq(2, 20, length.out = 10)
v3 <- seq(-2, 8, length.out = 10)
v4 <- seq(-6, 3, length.out = 10)
ghi <- data.frame(Date = v1, GHI_Tmax = v2, GHI_Tmean = v3, GHI_Tmin = v4)

# Named list of the three station frames.
df2 <- list(ABC = abc, DEF = def, GHI = ghi)
I want to have all correlation coefficients between df1 and df2, but only column-wise.
For example:
df1$Tmax and all df2*Tmax columns
df1$Tmean and all df2*Tmean columns
df1$Tmin and all df2*Tmin columns
I know that I can access all Tmax columns like that:
# Extract the second column of every data frame in df2.
lapply(df2, "[[", 2)
I know how to calculate the correlation between 2 single values:
# Spearman correlation test between df1's Tmax and the ABC station's Tmax.
cor.test(df1$Tmax, df2$ABC$ABC_Tmax, method = "spearman")
But how can I do it for all columns at once? I tried this, which is not working:
# Asker's attempt: the second argument is a list of columns rather than a single vector.
cor.test(df1$Tmax, lapply(df2, "[[", 2), method = "spearman")
Any ideas?
You could use lapply in combination with mapply to apply cor.test and extract a specific value from the test. For example, to get p.value and estimate we can do
# For column positions 2-4, test df1's column against the same-position column
# of every data frame in df2 and keep the p-value and the estimate.
lapply(2:4, function(col_idx) {
  # One-column data frame from df1; mapply recycles it across the df2 columns.
  reference <- df1[col_idx]
  candidates <- lapply(df2, "[[", col_idx)
  mapply(function(x, y) {
    res <- cor.test(x, y, method = "spearman")
    c(setNames(res$p.value, "pvalue"), res$estimate)
  }, candidates, reference)
})