I have a data frame with blocks of values with 0 and 1 and NAs, for example:
mydata <- data.frame(a = c(0,0,0,1,1,1,0,0,0,1,1,NA,NA,NA), b = c(0,0,1,1,1,1,0,0,1,1,0,NA,NA,NA))
what I want is to obtain, for each variable, the index of start and end of each block 1, this will the desired result:
mydata <- data.frame(a = c(4,6,10,11), b = c(3,6,9,10))
How can I code it?
You may try
apply(mydata, 2, function(x){
y <- rle(x == 1)
z <- c(cumsum(y$lengths)[which(y$values)], cumsum(y$lengths)[which(y$values) - 1] + 1)
return(sort(z))
})
a b
[1,] 4 3
[2,] 6 6
[3,] 10 9
[4,] 11 10
Since you stated that you only have 0,1,NA you could also use str_locate:
library(tidyverse)
map_df(mydata, ~c(t(str_locate_all(paste(., collapse = ''), '1+')[[1]])))
# A tibble: 4 x 2
a b
<int> <int>
1 4 3
2 6 6
3 10 9
4 11 10
You could also arrange it in start end format:
map_df(mydata, ~as_tibble(str_locate_all(paste(., collapse = ''), '1+')[[1]]), .id='grp')
# A tibble: 4 x 3
grp start end
<chr> <int> <int>
1 a 4 6
2 a 10 11
3 b 3 6
4 b 9 10
We can try diff + cumsum to generate grouping info and then use range to get the range of block
list2DF(
lapply(
mydata,
function(x) {
unlist(
by(
v <- which(x == 1),
cumsum(c(0, diff(v) != 1)),
range
)
)
}
)
)
which gives
a b
1 4 3
2 6 6
3 10 9
4 11 10
Another option is using aggregate
aggregate(
. ~ col,
data.frame(
which(mydata == 1, arr.ind = TRUE)
),
function(v) {
by(
v,
cumsum(c(0, diff(v) != 1)),
range
)
},
simplify = FALSE
)
which gives
col row
1 1 4, 6, 10, 11
2 2 3, 6, 9, 10
Related
This question already has answers here:
Dictionary style replace multiple items
(11 answers)
Closed 1 year ago.
Another thread solved a similar problem very nicely
But what i would like to do is get rid of some redundancy in my similar problem.
Using their example:
df <- data.frame(name = rep(letters[1:3], each = 3), foo=rep(1:9),var1 = letters[1:3], var2 = rep(3:5, each = 3))
creates:
df
name foo var1 var2
1 a 1 a 3
2 a 2 a 3
3 a 3 a 3
4 b 4 b 4
5 b 5 b 4
6 b 6 b 4
7 c 7 c 5
8 c 8 c 5
9 c 9 c 5
But what do i need to do to replace multiple characters with unique values?
a=1
b=2
c=3
I tried:
df[,c(4,6)] <- lapply(df[,c(4,6)], function(x) replace(x,x %in% "a", 1),
replace(x,x %in% "b", 2),
replace(x,x %in% "c", 3))
and
z<- c("a","b","c")
y<- c(1,2,3)
df[,c(1,3)] <- lapply(df[,c(1,3)], function(x) replace(x,x %in% z, y))
But neither seem to work.
Thanks.
You can use dplyr::recode
df <- data.frame(name = rep(letters[1:3], each = 3), foo=rep(1:9),var1 = letters[1:3], var2 = rep(3:5, each = 3))
library(dplyr, warn.conflicts = FALSE)
df %>%
mutate(across(c(name, var1), ~ recode(., a = 1, b = 2, c = 3)))
#> name foo var1 var2
#> 1 1 1 1 3
#> 2 1 2 2 3
#> 3 1 3 3 3
#> 4 2 4 1 4
#> 5 2 5 2 4
#> 6 2 6 3 4
#> 7 3 7 1 5
#> 8 3 8 2 5
#> 9 3 9 3 5
Created on 2021-10-19 by the reprex package (v2.0.1)
Across will apply the function defined by ~ recode(., a = 1, b = 2, c = 3) to both name and var1.
Using ~ and . is another way to define a function in across. This function is equivalent to the one defined by function(x) recode(x, a = 1, b = 2, c = 3), and you could use that code in across instead of the ~ form and it would give the same result. The only name I know for this is what it's called in ?across, which is "purrr-style lambda function", because the purrr package was the first to use formulas to define functions in this way.
If you want to see the actual function created by the formula, you can look at rlang::as_function(~ recode(., a = 1, b = 2, c = 3)), although it's a little more complex than the one above to support the use of ..1, ..2 and ..3 which are not used here.
Now that R supports the easier way of defining functions below, this purrr-style function is maybe no longer useful, it's just an old habit to write it that way.
df <- data.frame(name = rep(letters[1:3], each = 3), foo=rep(1:9),var1 = letters[1:3], var2 = rep(3:5, each = 3))
library(dplyr, warn.conflicts = FALSE)
df %>%
mutate(across(c(name, var1), \(x) recode(x, a = 1, b = 2, c = 3)))
#> name foo var1 var2
#> 1 1 1 1 3
#> 2 1 2 2 3
#> 3 1 3 3 3
#> 4 2 4 1 4
#> 5 2 5 2 4
#> 6 2 6 3 4
#> 7 3 7 1 5
#> 8 3 8 2 5
#> 9 3 9 3 5
Created on 2021-10-19 by the reprex package (v2.0.1)
A simple for loop would do the trick:
for (i in 1:length(z)) {
df[df==z[i]] <- y[i]
}
df
name foo var1 var2
1 1 1 1 3
2 1 2 2 3
3 1 3 3 3
4 2 4 1 4
5 2 5 2 4
6 2 6 3 4
7 3 7 1 5
8 3 8 2 5
9 3 9 3 5
You could use a lookup vector combined with apply:
z <- c("a","b","c")
y <- c(1,2,3)
lookup <- setNames(y, z)
df[,c(1,3)] <- apply(df[,c(1,3)], 2, function(x) lookup[x])
df
This returns
name foo var1 var2
1 1 1 1 3
2 1 2 2 3
3 1 3 3 3
4 2 4 1 4
5 2 5 2 4
6 2 6 3 4
7 3 7 1 5
8 3 8 2 5
9 3 9 3 5
If you are open to a tidyverse approach:
library(tidyverse)
df_new <- df %>%
mutate(across(c(var1, name), ~case_when(. == 'a' ~ 1,
. == 'b' ~ 2,
. == 'c' ~ 3)))
df_new
name foo var1 var2
1 1 1 1 3
2 1 2 2 3
3 1 3 3 3
4 2 4 1 4
5 2 5 2 4
6 2 6 3 4
7 3 7 1 5
8 3 8 2 5
9 3 9 3 5
Note, this code works only if you change all values of your column. E.g. if there was a „d“ in your var1 column that you don‘t tuen into a number, it would be changed to NA.
# Import data: df => data.frame
df <- data.frame(name = rep(letters[1:3], each = 3), foo=rep(1:9),var1 = letters[1:3], var2 = rep(3:5, each = 3))
# Function performing a mapping replacement:
# replaceMultipleValues => function()
replaceMultipleValues <- function(df, mapFrom, mapTo){
# Extract the values in the data.frame:
# dfVals => named character vector
dfVals <- unlist(df)
# Get all values in the mapping & data
# and assign a name to them: tmp1 => named character vector
tmp1 <- c(
setNames(mapTo, mapFrom),
setNames(dfVals, dfVals)
)
# Extract the unique values:
# valueMap => named character vector
valueMap <- tmp1[!(duplicated(names(tmp1)))]
# Recode the values, coerce vectors to appropriate
# types: res => data.frame
res <- type.convert(
data.frame(
matrix(
valueMap[dfVals],
nrow = nrow(df),
ncol = ncol(df),
dimnames = dimnames(df)
)
)
)
# Explicitly define the returned object: data.frame => env
return(res)
}
# Recode values in data.frame:
# res => data.frame
res <- replaceMultipleValues(
df,
c("a", "b", "c"),
c("1", "2", "3")
)
# Print data.frame to console:
# data.frame => stdout(console)
res
I have a big data frame from a survey. There is some statements where I need to use revere coding, hence I need to change values in few columns. I have tried below code (where x represents the column where I want to make the changes)
df$x <- replace( df$x, 1=7, 2=6, 3=5, 5=3, 6=2, 7=1)
But this did not work. Every help is much appreciated.
If your column has only 1-7 values you can subtract those values from 8 to reverse the values.
set.seed(123)
df <- data.frame(x = sample(7, 10, replace = TRUE))
df$y <- 8 - df$x
#Or maybe more general
#df$y <- max(df$x) + 1 - df$x
df
# x y
#1 7 1
#2 7 1
#3 3 5
#4 6 2
#5 3 5
#6 2 6
#7 2 6
#8 6 2
#9 3 5
#10 5 3
You could try case_when from package dplyr. The syntax is very clean.
library(dplyr)
df %>%
mutate(x=case_when(
x == 1 ~ 7,
x == 2 ~ 6,
x == 3 ~ 5,
x == 6 ~ 2,
x == 7 ~ 1,
TRUE ~ as.numeric(x)
))
DATA
set.seed(1)
df <- data.frame(x = sample(7, 10, replace = TRUE))
df
The solution above overwrites the varaible x. To compare result, I created a new_x variable with the replaced data:
df %>%
mutate(new_x=case_when(
x == 1 ~ 7,
x == 2 ~ 6,
x == 3 ~ 5,
x == 6 ~ 2,
x == 7 ~ 1,
TRUE ~ as.numeric(x)
))
x new_x
1 1 7
2 4 4
3 7 1
4 1 7
5 2 6
6 5 5
7 7 1
8 3 5
9 6 2
10 2 6
One way you can replace values is using which:
df$x[which(df$x=1)] <- 7 # this replaces 1 with 7
Another way is to use ifelse:
df$x <- ifelse(df$x == 1,7,ifelse(df$x == 2,6,ifelse....)) # replaces 1 with 7, 2 with 6 and so on..
An option with which.max
library(dplyr)
df %>%
mutate(y = x[which.max(x)] - x + 1)
I want to create a new column z based on the values of x and y. If x>y, z=y otherwise z=x.
x y
3 4
5 2
6 6
1 7
9 4
Output required:
x y z
3 4 3
5 2 2
6 6 6
1 7 1
9 4 4
You can use ifelse :
df$z <- with(df, ifelse(x > y, y, x))
#Or without with
#df$z <- ifelse(df$x > df$y, df$y, df$x)
df
# x y z
#1 3 4 3
#2 5 2 2
#3 6 6 6
#4 1 7 1
#5 9 4 4
In dplyr, you can use if_else which is same as above or case_when which is helpful when you have to list down multiple conditions.
library(dplyr)
df %>%
mutate(z = case_when(x > y ~ y,
TRUE ~x))
If I get it correctly, you are looking for minimum value out of several columns. You can use pmin function:
library(dplyr)
df <- data.frame(x = c(3,5,6,1,9),
y = c(4,2,6,7,4))
df <- df %>% mutate(z = pmin(x, y))
result:
> df
x y z
1 3 4 3
2 5 2 2
3 6 6 6
4 1 7 1
5 9 4 4
It will count minimum value in a data frame row wise and will simplify syntax if you would like to include more than 2 columns:
df <- data.frame(x = c(3, 5, 6, 1, 9),
y = c(4, 2, 6, 7, 4),
a = c(2, 5, 7, 3, 3))
df <- df %>% mutate(z = pmin(x, y, a))
result:
> df
x y a z
1 3 4 2 2
2 5 2 5 2
3 6 6 7 6
4 1 7 3 1
5 9 4 3 3
Similar to another answer but using data.table and pmin:
library(data.table)
dt <- data.table(x = c(3,5,6,1,9),
y = c(4,2,6,7,4))
dt[, z:= pmin(x,y)]
dt
# x y z
# 1: 3 4 3
# 2: 5 2 2
# 3: 6 6 6
# 4: 1 7 1
# 5: 9 4 4
Function pmin returns the parallel minima (https://www.rdocumentation.org/packages/mc2d/versions/0.1-17/topics/pmin)
Another option with fifelse in data.table
library(data.table)
setDT(dt)[, z := fifelse(x > y, y, x)]
I need to create a function which depends on 2 variables and a table(?) (x,y,table). First this function need to loop through every row in a table and dependending of 'X' column value (1 or 0) take a different action:
When x column = 1 then copy the row 'y' times. Then in a nested loop create a column numerating from 1 to y the times the row was copied.
When x column = 0 then just leave the row how it is.
An example of the input would be
the output would be
I tried doing the next code but honestly im not very good with loops.
My_function <- function(x,y,table) {
for (i in 1:nrow(table)) {
if(table[,which(colnames(table) = as.character(x)] == 1){
dummy = table[i,]
final_dummy = NULL
for(j in 1:dummy$y){
dummy_2 = dummy
final_dummy = rbind(final_dummy,dummy_2)
}
} else if(table[,which(colnames(table) = as.character(x)] == 0){
table[i,]
}
}
}
You can also use:
library(tidyverse)
df <- tibble(col = c(letters[1:3]), x = c(1, 0, 1), y = c(2, 3, 4))
df %>%
mutate(uncY = if_else(x == 0, 1, y)) %>%
uncount(uncY, .id = "id")
# A tibble: 7 x 4
col x y id
<chr> <dbl> <dbl> <int>
1 a 1 2 1
2 a 1 2 2
3 b 0 3 1
4 c 1 4 1
5 c 1 4 2
6 c 1 4 3
7 c 1 4 4
You can write the function as :
My_function <- function(data) {
out <- data[with(data, rep(seq_along(col), pmax(colx * coly, 1))), ]
rownames(out) <- NULL
out$num <- with(out, ave(colx, col, FUN = seq_along))
return(out)
}
and call it
My_function(df)
# col colx coly num
#1 a 1 2 1
#2 a 1 2 2
#3 b 0 3 1
#4 c 1 4 1
#5 c 1 4 2
#6 c 1 4 3
#7 c 1 4 4
data
df <- data.frame(col = letters[1:3], colx = c(1, 0, 1), coly = c(2, 3, 4))
I have the following dataframe
library(tidyverse)
x <- c(1,2,3,NA,NA,4,5)
y <- c(1,2,3,5,5,4,5)
z <- c(1,1,1,6,7,7,8)
df <- data.frame(x,y,z)
df
x y z
1 1 1 1
2 2 2 1
3 3 3 1
4 NA 5 6
5 NA 5 7
6 4 4 7
7 5 5 8
I would like to update the dataframe according to the following conditions
If z==1, update to x=1, else leave the current value for x
If z==1, update to y=2, else leave the current value for y
The following code does the job fine
df %>% mutate(x=if_else(z==1,1,x),y=if_else(z==1,2,y))
x y z
1 1 2 1
2 1 2 1
3 1 2 1
4 NA 5 6
5 NA 5 7
6 4 4 7
7 5 5 8
However, I have to add if_else statement for x and y mutate functions. This has the potential to make my code complicated and hard to read. To give you a SQL analogy, consider the following code
UPDATE df
SET x= 1, y= 2
WHERE z = 1;
I would like to achieve the following:
Specify the update condition ahead of time, so I don't have to repeat it for every mutate function
I would like to avoid using data.table or base R. I am using dplyr so I would like to stick to it for consistency
Using mutate_cond posted at dplyr mutate/replace several columns on a subset of rows we can do this:
df %>% mutate_cond(z == 1, x = 1, y = 2)
giving:
x y z
1 1 2 1
2 1 2 1
3 1 2 1
4 NA 5 6
5 NA 5 7
6 4 4 7
7 5 5 8
sqldf
Of course you can directly implement it in SQL with sqldf -- ignore the warning message that the backend RSQLite issues.
library(sqldf)
sqldf(c("update df set x = 1, y = 2 where z = 1", "select * from df"))
base R
It straight-forward in base R:
df[df$z == 1, c("x", "y")] <- list(1, 2)
library(dplyr)
df %>%
mutate(x = replace(x, z == 1, 1),
y = replace(y, z == 1, 2))
# x y z
#1 1 2 1
#2 1 2 1
#3 1 2 1
#4 NA 5 6
#5 NA 5 7
#6 4 4 7
#7 5 5 8
In base R
transform(df,
x = replace(x, z == 1, 1),
y = replace(y, z == 1, 2))
If you store the condition in a variable, you don't have to type it multiple times
condn = (df$z == 1)
transform(df,
x = replace(x, condn, 1),
y = replace(y, condn, 2))
Here is one option with map2. Loop through the 'x', 'y' columns of the dataset, along with the values to change, apply case_when based on the values of 'z' if it is TRUE, then return the new value, or else return the same column and bind the columns with the original dataset
library(dplyr)
library(purrr)
map2_df(df %>%
select(x, y), c(1, 2), ~ case_when(df$z == 1 ~ .y, TRUE ~ .x)) %>%
bind_cols(df %>%
select(z), .) %>%
select(names(df))
Or using base R, create a logical vector, use that to subset the rows of columns 'x', 'y' and update by assigning to a list of values
i1 <- df$z == 1
df[i1, c('x', 'y')] <- list(1, 2)
df
# x y z
#1 1 2 1
#2 1 2 1
#3 1 2 1
#4 NA 5 6
#5 NA 5 7
#6 4 4 7
#7 5 5 8
The advantage of both the solutions are that we can pass n number of columns with corresponding values to pass and not repeating the code
If you have an SQL background, you should really check out data.table:
library(data.table)
dt <- as.data.table(df)
set(dt, which(z == 1), c('x', 'y'), list(1, 2))
dt
# or perhaps more classic syntax
dt <- as.data.table(df)
dt
# x y z
#1: 1 1 1
#2: 2 2 1
#3: 3 3 1
#4: NA 5 6
#5: NA 5 7
#6: 4 4 7
#7: 5 5 8
dt[z == 1, `:=`(x = 1, y = 2)]
dt
# x y z
#1: 1 2 1
#2: 1 2 1
#3: 1 2 1
#4: NA 5 6
#5: NA 5 7
#6: 4 4 7
#7: 5 5 8
The last option is an update join. This is great if you have the lookup data already done upfront:
# update join:
dt <- as.data.table(df)
dt_lookup <- data.table(x = 1, y = 2, z = 1)
dt[dt_lookup, on = .(z), `:=`(x = i.x, y = i.y)]
dt