Replacing cells in a data frame - r

Let's say I have such a data frame with columns X, Y, Z, T and over 100 rows:
order X Y Z T
i a k b n
j c a b n
As you see, if i-th X and j-th Y have the same value (i.e a), then i-th Z and j-th Z have the same value (b) and i-th T and j-th T have the same value (n)
What I want to do is that if i-th X and j-th Y have the same value (i.e a), then i-th Z = b and j-th Z = n and i-th T = n, j-th T = b
order X Y Z T
i a k b n
j c a n b
I have tried doing this in R using if/else and a for loop, but I couldn't get it to work.
Can anyone help me do that in R?

It can be done with case_when
library(dplyr)
# Swap Z and T on every row whose Y equals the X of the previous row.
# The match flag and the previous row's Z are captured first, because mutate()
# evaluates its arguments sequentially: building T from lag(Z) after Z has been
# overwritten would read the already-swapped value when consecutive rows match.
df1 <- df1 %>%
  mutate(
    swap = coalesce(lag(X) == Y, FALSE), # first row has no predecessor
    z_prev = lag(Z),
    Z = if_else(swap, T, Z),
    T = if_else(swap, z_prev, T),
    swap = NULL, # drop the helper columns again
    z_prev = NULL
  )
-output
df1
# order X Y Z T
#1 i a k b n
#2 j c a n b
data
# Reproducible input: the two-row example from the question (rows "i" and "j"),
# where X of row i equals Y of row j and both rows share the same Z and T.
df1 <- structure(list(order = c("i", "j"), X = c("a", "c"), Y = c("k",
"a"), Z = c("b", "b"), T = c("n", "n")), class = "data.frame", row.names = c(NA,
-2L))

Related

Access all the columns with a particular name in nested lists in R

I wonder how I can access all the columns with a particular name in nested lists. Below there is a reproducible example. How can I call all the "mean" columns and collate all in a single data.frame where the data.frame as two other columns which specify associated classes and Output1/Output2 (Example 1). Example 2 is a little more complicated where the nested "mean" list is a data.frame. I need to access both "ts" and "value" columns. In other words, I need to know the ts corresponding to each value (in addition to classes and Output1/Output2).
Example 1
# Example 1: for every class, store a model plus the column means and sums of
# that class's rows, once for sample S1 ("Output1") and once for S2 ("Output2").
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 <- data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
S2 <- data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25))
P <- lapply(classes, function(cls) {
  # Numeric columns of the rows that belong to the current class.
  rows1 <- S1[S1$X4 == cls, 1:3]
  rows2 <- S2[S2$X4 == cls, 1:3]
  list(
    "Output1" = list(
      "model" = lm(X3 ~ X1 + X2, data = S1),
      "mean" = apply(rows1, 2, mean),
      "sum" = apply(rows1, 2, sum)
    ),
    "Output2" = list(
      "model" = lm(X3 ~ X1 + X2, data = S2),
      "mean" = apply(rows2, 2, mean),
      "sum" = apply(rows2, 2, sum)
    )
  )
})
Example 2
# Example 2: as Example 1, but "mean" is a data frame pairing each timestamp
# (ts) of the class's rows with the corresponding X1 value.
classes <- c("F", "G", "M", "O")
classes <- structure(unique(classes), names = unique(classes))
S1 <- data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25), ts = seq(from = ISOdate(1910, 1, 1), by = "30 min", length.out = 100))
S2 <- data.frame(X1 = rnorm(100), X2 = rnorm(100), X3 = rnorm(100), X4 = rep(classes, 25), ts = seq(from = ISOdate(1910, 1, 1), by = "30 min", length.out = 100))
P <- lapply(classes, function(cls) {
  # Row masks for the current class in each sample.
  in_class1 <- S1$X4 == cls
  in_class2 <- S2$X4 == cls
  list(
    "Output1" = list(
      "model" = lm(X3 ~ X1 + X2, data = S1),
      "mean" = data.frame(ts = S1[in_class1, "ts"], value = S1[in_class1, "X1"]),
      "sum" = apply(S1[in_class1, 1:3], 2, sum)
    ),
    "Output2" = list(
      "model" = lm(X3 ~ X1 + X2, data = S2),
      "mean" = data.frame(ts = S2[in_class2, "ts"], value = S2[in_class2, "X1"]),
      "sum" = apply(S2[in_class2, 1:3], 2, sum)
    )
  )
})
We can get "mean" columns from P using rvest's pluck and bind them together with map_df.
# rvest::pluck() is deprecated (rvest >= 1.0.0); purrr::map(.x, "mean") performs
# the same extraction of the "mean" element from each of Output1/Output2.
purrr::map_df(P, ~ purrr::map(.x, "mean"), .id = "Class")
# A tibble: 12 x 3
# Class Output1 Output2
# <chr> <dbl> <dbl>
# 1 F 0.0315 -0.0946
# 2 F 0.0935 0.219
# 3 F 0.155 0.172
# 4 G 0.123 0.182
# 5 G -0.114 -0.128
# 6 G -0.0654 -0.0990
# 7 M 0.111 0.0794
# 8 M -0.176 0.405
# 9 M 0.265 -0.0747
#10 O 0.0207 -0.250
#11 O -0.0407 0.0117
#12 O -0.162 -0.195
In base R, you can do :
# Per class: a matrix with one column per output (Output1/Output2) holding "mean".
temp <- lapply(P, function(outputs) {
  sapply(outputs, function(out) out[["mean"]])
})
# Tag each matrix with its class name and stack everything into one data frame.
do.call(
  rbind,
  Map(function(tbl, cls) cbind.data.frame(tbl, Class = cls), temp, names(temp))
)
EDIT
For the dataframe example, we can use bind_rows after pluck.
# rvest::pluck() is deprecated (rvest >= 1.0.0); purrr::map(.x, "mean") does the
# same per-element extraction. The Output1/Output2 "mean" data frames of each
# class are then stacked, with "output" and "Class" recording their origin.
map_df(P, ~ purrr::map(.x, "mean") %>% bind_rows(.id = "output"), .id = "Class")

Add a column to a data frame in R based on a greater/less than condition on the values in existing columns

I have a dataframe and want to create a new column Z populated with a value of "tw" or "ok" for each record. If x > y, z = "ok" , IF x < y, z = "tw".
x y
a 1 2
b 2 3
c 5 1
result
x y z
a 1 2 tw
b 2 3 tw
c 5 1 ok
Maybe you can try ifelse() like below
# Label each row "ok" when x exceeds y, otherwise "tw".
df$z <- ifelse(df$x > df$y, "ok", "tw")
If you do not define the output for the case "x==y", maybe you should add the following line
# Rows where x equals y have no defined label: blank them out.
df$z <- replace(df$z, df$x == df$y, NA)
alternatively you can do it on the dataframe directly:
# Build the example data frame.
df <- data.frame("x" = c(1, 2, 5), "y" = c(2, 3, 1))
# Start z as all-NA, then fill in the two labelled cases; rows with x == y
# keep NA.
df$z <- NA_character_
df$z[df$x > df$y] <- "ok"
df$z[df$x < df$y] <- "tw"
We can do this directly :
# The comparison yields FALSE/TRUE, i.e. 0/1; adding 1 turns it into an index
# into c("tw", "ok").
df[["z"]] <- c("tw", "ok")[1L + (df[["x"]] > df[["y"]])]
df
# x y z
#a 1 2 tw
#b 2 3 tw
#c 5 1 ok
It is not exactly clear what you want to do when x == y (the code above assigns "tw" in that case).
We can also use case_when from dplyr to assign values based on various condition.
library(dplyr)
# The two labelled cases are mutually exclusive; anything else (ties, missing
# values) becomes NA.
df %>%
  mutate(
    z = case_when(
      x < y ~ "tw",
      x > y ~ "ok",
      TRUE ~ NA_character_
    )
  )
data
# Reproducible input: rows a, b, c with integer x and y; the z column already
# holds the expected labels.
df <- structure(list(x = c(1L, 2L, 5L), y = c(2L, 3L, 1L), z = c("tw",
"tw", "ok")), row.names = c("a", "b", "c"), class = "data.frame")

How to find the given value from the range of values?

I have the data below:
Y z
100-800 a
150-600 b
200-300 c
400-600 d
4000-12000 e
Any help would be really appreciated.
Based on a given value of x (e.g. x = 100), it should find the ranges of Y that contain x and return the corresponding values of Y and z. If the given value of x is not in any of the ranges of Y, it should find the nearest range and return the corresponding values of Y and z.
# Asker's attempt, written in data.table's DT[i, j, by] form.
# NOTE(review): in the data shown above Y holds "low-high" strings and the
# column is named z, so seq(Y, Y) and by = Z presumably do not work as
# written - confirm intent.
DT[, list(OK = 1 %in% seq(Y, Y)), by = Z]
For given value of X=110
output should be
Y Z
100-800 a
For x=200
Y z
100-800 a
150-600 b
200-300 c
For x=12500
Y z
4000-12000 e
We can write a helper function using tidyr::separate to separate columns. In case if there are no indices which fall within the range we compare the value with lowest value and highest value in the dataframe and return the row accordingly.
#' Rows of `df` whose "low-high" range in column `Y` contains `val`.
#'
#' If no range contains `val`, the single nearest range is returned instead.
#' This also covers values that fall in a gap between ranges, not only values
#' below the lowest or above the highest bound.
#'
#' @param df Data frame with a column `Y` of "low-high" strings.
#' @param val Numeric value to look up.
#' @return The matching rows of `df`, or the one row with the nearest range.
subset_fun <- function(df, val) {
  df1 <- tidyr::separate(df, Y, c("low", "high"), sep = "-", convert = TRUE)
  # Distance from val to each closed interval [low, high]; 0 means "inside".
  gap <- pmax(df1$low - val, val - df1$high, 0)
  if (any(gap == 0)) {
    df[gap == 0, ]
  } else {
    df[which.min(gap), ]
  }
}
subset_fun(df, 100)
# Y z
#1 100-800 a
subset_fun(df, 200)
# Y z
#1 100-800 a
#2 150-600 b
#3 200-300 c
subset_fun(df, 12500)
# Y z
#5 4000-12000 e
subset_fun(df, 0)
# Y z
#1 100-800 a
data
# Reproducible input: five "low-high" ranges (Y) with their labels (z), both
# stored as factors.
df <- structure(list(Y = structure(1:5, .Label = c("100-800", "150-600",
"200-300", "400-600", "4000-12000"), class = "factor"),
z = structure(1:5, .Label = c("a", "b", "c", "d", "e"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))
We can create a helper and use this to subset:
library(dplyr)
library(tidyr)
#' Test whether `x` lies inside the range(s) [low, high].
#'
#' Both bounds are inclusive, so a range written "200-300" contains 300.
#'
#' @param x Numeric value to look up.
#' @param high Upper bound(s) of the range(s).
#' @param low Lower bound(s) of the range(s).
#' @return Logical vector, TRUE where `x` falls inside the range.
find_number <- function(x, high, low) {
  x >= low & x <= high
}
# Split "low-high" on the dash and convert both bounds to numbers. Without
# convert = TRUE the bounds stay character and find_number() compares strings
# (for example "1000" < "800" is TRUE).
new_df <- df %>%
  separate(Y, c("Low", "High"), sep = "-", convert = TRUE)
# Keep the rows whose range contains 200.
new_df[find_number(200, high = new_df$High, low = new_df$Low), ]
Low High z
1 100 800 a
2 150 600 b
3 200 300 c
EDIT: An attempt to automate this process. Using NSE might be a much better option since that would eliminate the need to have the exact same names as in this answer. In other words, redefine the function with a data and column name arguments. For now:
#' Rows of `data` whose [Low, High] range contains `x`.
#'
#' Bounds are inclusive. A value above every range selects the row(s) with the
#' highest `High`; a value below every range selects the row(s) with the lowest
#' `Low`.
#'
#' @param x Numeric value to look up.
#' @param data Data frame with numeric columns `Low` and `High`; defaults to
#'   the `new_df` object visible when the function is called.
#' @return The selected rows of `data`.
find_number <- function(x, data = new_df) {
  inside <- x >= data$Low & x <= data$High
  above_all <- data$High == max(data$High) & x > data$High
  below_all <- data$Low == min(data$Low) & x < data$Low
  data[inside | above_all | below_all, ]
}
find_number(12500)
Low High z
5 4000 12000 e
Data:
# Reproducible input: the ranges already split into numeric Low/High columns,
# with their labels in z.
new_df<-structure(list(Low = c(100, 150, 200, 400, 4000), High = c(800,
600, 300, 600, 12000), z = c("a", "b", "c", "d", "e")), class = "data.frame", row.names = c(NA,
-5L))

Remove rows from a dataframe that match two columns in another dataframe R

I am struggling to remove rows from a data frame in R, where values from different columns match two values from different columns in a second data frame.
For example, given the following pseudo-data:
# DF1: three (ID1, ID2) pairs with a random value each.
ID1 <- c(5, 10, 6)
ID2 <- c(3, 5, 4)
Value <- rnorm(3)
DF1 <- data.frame(ID1, ID2, Value)
# DF2: all 100 (x, y) combinations of 1..10 with a random z. Built in one
# vectorised step instead of growing x, y and z inside a loop.
x <- rep(1:10, each = 10)
y <- rep(1:10, times = 10)
z <- rnorm(100)
DF2 <- data.frame(x, y, z)
I would like to remove the rows from DF2 where the combination of x and y matches ID1 and ID2 from DF1 (ie x = 5 and y = 3, x = 10 and y = 5, x = 6 and y = 4, but also x = 3 and y = 5, x = 5 and y = 10, x = 4 and y = 6).
Make exclude list
# Exclusion list: every (ID1, ID2) pair followed by its mirrored (ID2, ID1) pair.
excl <- rbind(
  data.frame(x = DF1$ID1, y = DF1$ID2),
  data.frame(x = DF1$ID2, y = DF1$ID1)
)
Then use anti join:
library(dplyr)
# Keep only the rows of DF2 that have no (x, y) match in excl.
DF2 %>%
  anti_join(excl, by = c("x", "y"))
Or using paste as suggested in the comments:
# Build "x y" keys for DF2 and for both orientations of the DF1 pairs, then
# drop the rows of DF2 whose key appears among them.
DF2[!(paste(DF2$x, DF2$y) %in%
  paste(c(DF1$ID1, DF1$ID2), c(DF1$ID2, DF1$ID1))), ]
Another option is to use @zx8754's excl together with the match_df function from the plyr package:
library(plyr)
# Drop the rows of DF2 that match_df() finds in excl. Filtering by row name
# with %in% (rather than a negative index) keeps every row when nothing
# matches; DF2[-numeric(0), ] would return zero rows.
DF2[!rownames(DF2) %in% rownames(match_df(DF2, excl)), ]

How to melt a dataframe with measured variables and associated standard deviations in two columns

I have a premade dataframe, in which each measured variable features an adjacent column with the standard deviations:
# Wide input: each measured variable (A, B, C) is followed by its standard
# deviation column (SD, SD.1, SD.2); D has no SD column and contains an NA.
df <-
structure(list(Factor = structure(1:3, .Label = c("K", "L", "M"
), class = "factor"), A = c(52127802.82, 63410325.61, 76455661.87
), SD = c(9124562.98, 21975533.21, 9864019.36), B = c(63752980.62,
68303447.17, 73250794.15), SD.1 = c(34800000, 22600000, 6090000
), C = c(103512032.04, 65074190.8, 92686982.97), SD.2 = c(23900000,
20800000, 38300000), D = c(100006463.22, NA, 37406494.3)), .Names = c("Factor",
"A", "SD", "B", "SD.1", "C", "SD.2", "D"), class = "data.frame", row.names = c(NA,
-3L))
(SD.1, SD.2 were auto-renamed; originally they were all called "SD").
I want to melt into long format by factor:
library(reshape)
# Melt to long format with one row per Factor/variable pair (the stray
# trailing period after the call was a syntax error).
df.melt <- melt(df, id.vars = "Factor")
However, I would like to have the melted object to keep the SD columns attached to their associated columns:
Factor Variable value value.sd
K A 52127802.82 9124562
So, i can call geom_errorbar(ymin=sd.value, ymax=sd.value) in ggplot(df.melt, aes(Factor, value)) + geom_bar(stat="identity") + facet_wrap(~variable).
Is that possible, even with the different row.names for SD?
First, I would drop df$D from the dataset because I think this is an error via df$D <- NULL:
# Factor A SD B SD.1 C SD.2
# 1 K 52127803 9124563 63752981 34800000 103512032 23900000
# 2 L 63410326 21975533 68303447 22600000 65074191 20800000
# 3 M 76455662 9864019 73250794 6090000 92686983 38300000
Then, I would rename the columns (this looks more complicated than it is and I encourage feedback/suggestions that would make this part more straightforward) -- the reason I am renaming the columns is so that I can use separate and spread from the package tidyr:
# After "Factor" the columns alternate measure, SD, measure, SD, ...
# Take every other name (the measure letters), repeat each one twice and
# attach the suffixes "-measure" and "-SD" in turn.
names(df)[-1] <- paste0(
  rep(names(df)[-1][c(TRUE, FALSE)], each = 2),
  c("-measure", "-SD")
)
df
# Factor A-measure A-SD B-measure B-SD C-measure C-SD
# 1 K 52127803 9124563 63752981 34800000 103512032 23900000
# 2 L 63410326 21975533 68303447 22600000 65074191 20800000
# 3 M 76455662 9864019 73250794 6090000 92686983 38300000
This enables me to make df_clean:
# Reshape to one row per Factor/letter with "measure" and "SD" columns.
# pivot_longer() with the ".value" sentinel replaces the superseded
# gather() + separate() + spread() chain: the part of each column name before
# "-" becomes measure_letter, the part after it names the output column.
# as.data.frame() keeps the result a plain data frame, as before.
df_clean <- df %>%
  pivot_longer(
    -Factor,
    names_to = c("measure_letter", ".value"),
    names_sep = "-"
  ) %>%
  as.data.frame()
df_clean
# Factor measure_letter measure SD
# 1 K A 52127803 9124563
# 2 K B 63752981 34800000
# 3 K C 103512032 23900000
# 4 L A 63410326 21975533
# 5 L B 68303447 22600000
# 6 L C 65074191 20800000
# 7 M A 76455662 9864019
# 8 M B 73250794 6090000
# 9 M C 92686983 38300000
Now that our dataset is clean/tidy, we can plot accordingly:
library(ggplot2)
# One bar per Factor and one panel per measured variable, with error bars
# spanning measure - SD to measure + SD.
ggplot(df_clean, aes(x = Factor, y = measure, fill = Factor)) +
  geom_col() +
  geom_errorbar(aes(ymin = measure - SD, ymax = measure + SD)) +
  facet_wrap(~ measure_letter)

Resources