I am trying to create a script that allows me to transform my data. At the moment I have two different scripts depending on the two ways I usually get the data; either in presence/absence form or with the abundance class written next to each case.
Script 1:
# Presence/absence layout: columns 2 to 14 are compared with 1, scanning from
# left to right; the first column that matches decides the abundance code.
# Rows where none of the columns equals 1 get NA.
data %>%
  mutate(
    abundance = case_when(
      data[, 2] == 1 ~ 1,
      data[, 3] == 1 ~ 2,
      data[, 4] == 1 ~ 3,
      data[, 5] == 1 ~ 3,
      data[, 6] == 1 ~ 4,
      data[, 7] == 1 ~ 4,
      data[, 8] == 1 ~ 4,
      data[, 9] == 1 ~ 4,
      data[, 10] == 1 ~ 5,
      data[, 11] == 1 ~ 5,
      data[, 12] == 1 ~ 5,
      data[, 13] == 1 ~ 5,
      data[, 14] == 1 ~ 5,
      TRUE ~ NA_real_
    )
  )
Script 2:
# Class-label layout: column 2 holds the cover class as text.
# Each label is mapped to its abundance code; labels not listed become NA.
mutate(data, abundance = case_when(
  # lowest class, as in the combined script further down; without this line
  # abundance code 1 could never be assigned from the label layout
  data[, 2] == "< 0,1" ~ 1,
  data[, 2] == "< 1" ~ 2,
  data[, 2] == "1 - <5%" ~ 3,
  data[, 2] == "5 - <10%" ~ 3,
  data[, 2] == "10 - <20%" ~ 4,
  data[, 2] == "20 - <30%" ~ 4,
  data[, 2] == "30 - <40%" ~ 4,
  data[, 2] == "40 - <50%" ~ 4,
  data[, 2] == "50 - <60%" ~ 5,
  data[, 2] == "60 - <70%" ~ 5,
  data[, 2] == "70 - <80%" ~ 5,
  data[, 2] == "80 - <90%" ~ 5,
  data[, 2] == "90 - 100%" ~ 5,
  TRUE ~ NA_real_
))
Therefore, my data have either two columns or fourteen. I have been thinking about how I could make r distinguish between these two possibilities, and although this option works, the results are not as expected since it adds the value of 1 to all cases.
# Attempt to choose the mapping by column count inside a single mutate():
#   more than two columns -> presence/absence mapping (first case_when),
#   otherwise             -> class-label mapping (second case_when).
# NOTE(review): the ifelse() test `ncol(data)>2` is a single value, not one
# value per row; this is the part the reply below identifies as the problem.
mutate(data, abundance= ifelse(ncol(data)>2, case_when(data[,2]== 1 ~ 1, data[,3]==1 ~ 2,
data[,4]==1 ~ 3,
data[,5]==1 ~ 3,
data[,6]==1 ~ 4,
data[,7]==1 ~ 4,
data[,8]==1 ~ 4,
data[,9]== 1 ~ 4,
data[,10]==1 ~ 5,
data[,11]==1 ~ 5,
data[,12]==1 ~ 5,
data[,13]==1 ~ 5,
data[,14]==1 ~ 5,
TRUE ~ NA_real_), case_when(data[,2]=="< 0,1" ~ 1,
data[,2]=="< 1" ~ 2,
data[,2]=="1 - <5%" ~ 3,
data[,2]=="5 - <10%" ~ 3,
data[,2]=="10 - <20%" ~ 4,
data[,2]=="20 - <30%" ~ 4,
data[,2]=="30 - <40%" ~ 4,
data[,2]=="40 - <50%" ~ 4,
data[,2]=="50 - <60%" ~ 5,
data[,2]=="60 - <70%" ~ 5,
data[,2]=="70 - <80%" ~ 5,
data[,2]=="80 - <90%" ~ 5,
data[,2]=="90 - 100%" ~ 5,
TRUE ~ NA_real_)))
I hope you can understand me and thank you very much for your help.
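The problem with your solution is in the ifelse(). ifelse() returns a result of the same length as its condition, so the condition has to be as long as the data. Here ncol(data) > 2 is a single value, so only the first element of the chosen branch is returned and then repeated for every row. A naive solution is to replicate the condition to the length of the data: ifelse(rep(ncol(data) > 2, nrow(data)), ..., ...)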
Below is the solution with some dummy data:
library(tidyverse)
set.seed(123)
# dummy data: `a` is the two-column layout, `b` is the same values twice,
# renamed V1..V4, standing in for the wide layout
a <- data.frame(
  V1 = round(runif(10) * 10),
  V2 = round(runif(10) * 10)
)
b <- setNames(cbind(a, a), paste0("V", 1:4))
# test of "case_when" for each input data format
# Wide layout on `b`: the first of columns 1 and 2 that equals 1 decides the
# code; rows without a match get NA.
b %>%
  mutate(
    abundance = case_when(
      b[, 1] == 1 ~ 1,
      b[, 2] == 1 ~ 2,
      TRUE ~ NA_real_
    )
  )
#> V1 V2 V3 V4 abundance
#> 1 3 10 3 10 NA
#> 2 8 5 8 5 NA
#> 3 4 7 4 7 NA
#> 4 9 6 9 6 NA
#> 5 9 1 9 1 2
#> 6 0 9 0 9 NA
#> 7 5 2 5 2 NA
#> 8 9 0 9 0 NA
#> 9 6 3 6 3 NA
#> 10 5 10 5 10 NA
# Two-column layout on `a`: recode the value in column 2; values with no
# matching case stay NA.
a %>%
  mutate(
    abundance = case_when(
      a[, 2] == 1 ~ 1,
      a[, 2] == 2 ~ 2
    )
  )
#> V1 V2 abundance
#> 1 3 10 NA
#> 2 8 5 NA
#> 3 4 7 NA
#> 4 9 6 NA
#> 5 9 1 1
#> 6 0 9 NA
#> 7 5 2 2
#> 8 9 0 NA
#> 9 6 3 NA
#> 10 5 10 NA
# Solution
data <- a
# `ncol(data) > 2` is one TRUE/FALSE for the whole table, so a scalar if/else
# selects the mapping directly; the condition does not have to be stretched
# to nrow(data) for ifelse(). Only the selected case_when() is evaluated.
mutate(data, abundance = if (ncol(data) > 2) {
  # wide layout: first column equal to 1 decides the code
  case_when(
    data[, 1] == 1 ~ 1,
    data[, 2] == 1 ~ 2,
    TRUE ~ NA_real_
  )
} else {
  # two-column layout: recode the value in column 2
  case_when(
    data[, 2] == 1 ~ 1,
    data[, 2] == 2 ~ 2
  )
})
#> V1 V2 abundance
#> 1 3 10 NA
#> 2 8 5 NA
#> 3 4 7 NA
#> 4 9 6 NA
#> 5 9 1 1
#> 6 0 9 NA
#> 7 5 2 2
#> 8 9 0 NA
#> 9 6 3 NA
#> 10 5 10 NA
data <- b
# `ncol(data) > 2` is one TRUE/FALSE for the whole table, so a scalar if/else
# selects the mapping directly; the condition does not have to be stretched
# to nrow(data) for ifelse(). Only the selected case_when() is evaluated.
mutate(data, abundance = if (ncol(data) > 2) {
  # wide layout: first column equal to 1 decides the code
  case_when(
    data[, 1] == 1 ~ 1,
    data[, 2] == 1 ~ 2,
    TRUE ~ NA_real_
  )
} else {
  # two-column layout: recode the value in column 2
  case_when(
    data[, 2] == 1 ~ 1,
    data[, 2] == 2 ~ 2
  )
})
#> V1 V2 V3 V4 abundance
#> 1 3 10 3 10 NA
#> 2 8 5 8 5 NA
#> 3 4 7 4 7 NA
#> 4 9 6 9 6 NA
#> 5 9 1 9 1 2
#> 6 0 9 0 9 NA
#> 7 5 2 5 2 NA
#> 8 9 0 9 0 NA
#> 9 6 3 6 3 NA
#> 10 5 10 5 10 NA
Related
I receive a data frame in which the number of V columns increases regularly (currently up to V49). This example uses only V1 to V7. There are a lot of NAs, plus three other columns (ID, REP and all) that are used in a later step.
# Example data: one row per ID with measurement columns V1..V7; the later
# columns are NA for some IDs.
ID <- c("A", "B", "B3", "E4", "JE5", "L6")
V1 <- c(3, 5, 1, 3, 7, 1)
V2 <- c(6, 4, 2, 7, 6, 2)
V3 <- c(6, 5, 2, 7, 6, 3)
V4 <- c(6, 7, 1, 7, 6, 3)
V5 <- c(NA, NA, 2, 7, NA, 3)
V6 <- c(NA, NA, 2, 7, NA, 3)
V7 <- c(NA, NA, NA, 7, NA, 3)
REP <- c(4, 4, 6, 7, 4, 7)
# NOTE(review): this variable shares its name with the base function all().
all <- c(6, 5, 2, 7, 6, 3)
# `variation` starts at 0 for every row.
variation <- c(0, 0, 0, 0, 0, 0)
# Assemble the vectors above into one data frame, one column per vector.
df <- data.frame(ID, V1, V2, V3, V4, V5, V6, V7, REP, all, variation)
I want this result: set variation to 2 when V(i+1) - V(i) is < 0 or > 1 for any pair of consecutive columns.
i runs from V1 to Vmax, where Vmax is different for each individual: V4 for A and B, V6 for B3, and so on.
# ID V1 V2 V3 V4 V5 V6 V7 REP all variation
# 1 A 3 6 6 6 NA NA NA 4 6 0
# 2 B 5 4 5 7 NA NA NA 4 5 0
# 3 B3 1 2 2 1 2 2 NA 6 2 0
# 4 E4 3 7 7 7 7 7 7 7 7 0
# 5 JE5 7 6 6 6 NA NA NA 4 6 0
# 6 L6 1 2 3 3 3 3 3 7 3 0
I tried a loop, but it takes a long time, so I tried case_when(). It works, but every week I need to add a new line, because I want to see when there is a variation between the variables V(i) and V(i+1).
!between(Vi+1 - Vi, 0, 1) ~ 2,....
# Set `variation` to 2 as soon as one step between consecutive V columns
# falls outside the range 0..1; rows with no such step keep their current
# `variation` value.
df <- mutate(
  df,
  variation = case_when(
    !between(V2 - V1, 0, 1) ~ 2,
    !between(V3 - V2, 0, 1) ~ 2,
    !between(V4 - V3, 0, 1) ~ 2,
    !between(V5 - V4, 0, 1) ~ 2,
    !between(V6 - V5, 0, 1) ~ 2,
    !between(V7 - V6, 0, 1) ~ 2,
    TRUE ~ as.numeric(variation)
  )
)
Is there an automatic increment for case_when or other function with apply or map to avoid writing all variables by hand
Thank you in advance.
I would pivot the data into a longer format, then just compare all the values to their lag in a single statement.
# Reshape the V columns to long form so that every value can be compared with
# the previous one for the same ID, then join the per-ID result back onto the
# original columns.
df %>%
  pivot_longer(matches("V[0-9]+")) %>%
  group_by(ID) %>%
  summarize(
    # Differences that are NA (the first value of each ID, missing
    # measurements) are dropped by na.rm, so they never raise the flag.
    variation = if (any(!between(value - lag(value), 0, 1), na.rm = TRUE)) {
      2
    } else {
      unique(variation)
    }
  ) %>%
  right_join(select(df, -variation), by = "ID")
#> # A tibble: 6 × 11
#> ID variation V1 V2 V3 V4 V5 V6 V7 REP all
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2 3 6 6 6 NA NA NA 4 6
#> 2 B 2 5 4 5 7 NA NA NA 4 5
#> 3 B3 2 1 2 2 1 2 2 NA 6 2
#> 4 E4 2 3 7 7 7 7 7 7 7 7
#> 5 JE5 2 7 6 6 6 NA NA NA 4 6
#> 6 L6 0 1 2 3 3 3 3 3 7 3
I want the indices of the unselected rows when using sample() in R. Consider the following case.
df <- data.frame(
  id = c(1, 1, 2, 2, 3, 3),
  v1 = c(2, 2, 9, 4, 7, 1),
  v2 = c(3, 5, 8, 5, 8, 5)
)
ss <- ceiling(0.5 * nrow(df)) # size: half of the rows, rounded up
set.seed(123)
# row positions drawn without replacement (FALSE spelled out: `F` is an
# ordinary variable that can be reassigned)
rid <- sample(seq_len(nrow(df)), size = ss, replace = FALSE)
Now, the rows 3,6,2 are randomly selected. Is there a way to know indices of unselected rows (1,4,5)?
Thanks!
You can use df[-rid,]:
df <- data.frame(
  id = c(1, 1, 2, 2, 3, 3),
  v1 = c(2, 2, 9, 4, 7, 1),
  v2 = c(3, 5, 8, 5, 8, 5)
)
ss <- ceiling(0.5 * nrow(df)) # size
set.seed(123)
rid <- sample(seq_len(nrow(df)), size = ss, replace = FALSE)
rid
#> [1] 3 6 2
df
#> id v1 v2
#> 1 1 2 3
#> 2 1 2 5
#> 3 2 9 8
#> 4 2 4 5
#> 5 3 7 8
#> 6 3 1 5
# rows that were drawn
df[rid, ]
#> id v1 v2
#> 3 2 9 8
#> 6 3 1 5
#> 2 1 2 5
# rows that were not drawn (negative positions drop rows)
df[-rid, ]
#> id v1 v2
#> 1 1 2 3
#> 4 2 4 5
#> 5 3 7 8
# Positions of the rows that were not drawn. Unlike rownames(df[-rid, ]),
# this gives integer positions and does not depend on the row names of `df`.
unselected <- setdiff(seq_len(nrow(df)), rid)
unselected
#> [1] 1 4 5
Created on 2021-11-05 by the reprex package (v2.0.1)
I have a database in R where there are some NAs in the variables. I would like to apply a logic function where the NAs would be filled with the immediately preceding value. Below is an example:
# Example data: two numeric columns with NA gaps in rows 4, 7 and 8.
dados <- tibble::tibble(
  x = c(2, 3, 5, NA, 2, 1, NA, NA, 9, 3),
  y = c(4, 1, 9, NA, 8, 5, NA, NA, 1, 2)
)
# A tibble: 10 x 2
x y
<dbl> <dbl>
1 2 4
2 3 1
3 5 9
4 NA NA
5 2 8
6 1 5
7 NA NA
8 NA NA
9 9 1
10 3 2
In this case, the 4th value of the variable x would be filled with a 5 and so on.
Thank you!
We could use fill from tidyr package:
library(tidyr)
library(dplyr)
# fill() with .direction = "down" is applied to the NA cells of x and y
dados %>%
  fill(c(x, y), .direction = "down")
x y
<dbl> <dbl>
1 2 4
2 3 1
3 5 9
4 5 9
5 2 8
6 1 5
7 1 5
8 1 5
9 9 1
10 3 2
We can use coalesce
library(dplyr)
# For x and y: where a value is NA, take the value from the row directly
# above (a look-back of exactly one row).
dados %>%
  mutate(across(c(x, y), \(col) coalesce(col, lag(col))))
# A tibble: 10 x 2
x y
<dbl> <dbl>
1 2 4
2 3 1
3 5 9
4 5 9
5 2 8
6 1 5
7 1 5
8 NA NA
9 9 1
10 3 2
library(dplyr)
# Spelled out per column: an NA takes the value from the row directly above,
# every other value is kept as it is.
dados %>%
  mutate(
    x = if_else(is.na(x), lag(x), x),
    y = if_else(is.na(y), lag(y), y)
  )
The following only works if the first value in a column is not NA; for the sake of clear and simple code I leave that case as an exercise for you. We can solve this for one column as follows:
library(tibble)
dados <- tibble::tibble(
  x = c(2, 3, 5, NA, 2, 1, NA, NA, 9, 3),
  y = c(4, 1, 9, NA, 8, 5, NA, NA, 1, 2)
)
# where are the NA?
# Position 1 has no preceding value to copy, so it is left out: a leading NA
# stays NA instead of misaligning the replacement below.
pos <- dados$x |>
  is.na() |>
  which() |>
  setdiff(1L)
# replace
# Walk the gaps from top to bottom: each position copies its predecessor,
# which has already been filled, so a run of consecutive NAs is handled in a
# single pass and the loop always terminates.
for (i in pos) {
  dados$x[i] <- dados$x[i - 1]
}
dados
I have a dataframe like this
# Example data frame: one Love_* and one Hate_* column per channel, 8 rows.
structure(
  list(
    Love_ABC = c(1, 3, 4, 6, 3, 2, 5, 1),
    Love_CNN = c(1, 3, 4, 2, 6, 7, 2, 6),
    Hate_ABC = c(6, 3, 6, 5, 3, 6, 5, 3),
    Hate_CNN = c(6, 2, 4, 5, 3, 7, 2, 6),
    Love_CNBC = c(1, 2, 4, 5, 6, 7, 6, 3),
    Hate_CNBC = c(2, 3, 4, 2, 2, 7, 5, 2)
  ),
  row.names = c(NA, 8L),
  class = "data.frame"
)
I have made the following for loop
# Channel suffixes shared by the Love_* and Hate_* columns.
channels = c("ABC", "CNN", "CNBC")
# Intent: for each channel, add ALL_<channel> = Love_<channel> + Hate_<channel>.
for (channel in channels) {
# NOTE(review): the loop variable `channel` is never used in the body;
# `ALL_channel`, `Love_channel` and `Hate_channel` are written out literally.
dataframe <- dataframe %>%
mutate(ALL_channel = Love_channel + Hate_channel)
}
But when i run the for loop R tells me " object Love_channel" not found. Have i done something wrong in the for loop?
Here's a way with rlang. Note, reshaping the data is likely more straightforward. Non-standard evaluation (NSE) is a complicated topic.
# Build each channel's column names as strings: `.data[[name]]` reads an
# existing column by name and `!!name :=` names the new column.
for (channel in channels) {
  DF <- mutate(
    DF,
    !!paste0("ALL_", channel) := .data[[paste0("Love_", channel)]] +
      .data[[paste0("Hate_", channel)]]
  )
}
DF
## Love_ABC Love_CNN Hate_ABC Hate_CNN Love_CNBC Hate_CNBC ALL_ABC ALL_CNN ALL_CNBC
## 1 1 1 6 6 1 2 7 7 3
## 2 3 3 3 2 2 3 6 5 5
## 3 4 4 6 4 4 4 10 8 8
## 4 6 2 5 5 5 2 11 7 7
## 5 3 6 3 3 6 2 6 9 8
## 6 2 7 6 7 7 7 8 14 14
## 7 5 2 5 2 6 5 10 4 11
## 8 1 6 3 6 3 2 4 12 5
This is a solution with dplyr and tidyr:
library(tidyr)
library(dplyr)
# Give every row an id so the per-channel sums can be joined back afterwards.
dataframe <- tibble::rowid_to_column(dataframe)

dataframe %>%
  # long form: drop the Love/Hate part of each name (the NA), keep the channel
  pivot_longer(
    cols = -rowid,
    names_to = c(NA, "channel"),
    names_sep = "_"
  ) %>%
  # one ALL_<channel> column per channel, summing the values that share a
  # rowid and channel
  pivot_wider(
    names_from = channel,
    names_prefix = "ALL_",
    values_from = value,
    values_fn = sum
  ) %>%
  right_join(dataframe, by = "rowid") %>%
  select(-rowid)
#> # A tibble: 8 x 9
#> ALL_ABC ALL_CNN ALL_CNBC Love_ABC Love_CNN Hate_ABC Hate_CNN Love_CNBC Hate_CNBC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 7 7 3 1 1 6 6 1 2
#> 2 6 5 5 3 3 3 2 2 3
#> 3 10 8 8 4 4 6 4 4 4
#> 4 11 7 7 6 2 5 5 5 2
#> 5 6 9 8 3 6 3 3 6 2
#> 6 8 14 14 2 7 6 7 7 7
#> 7 10 4 11 5 2 5 2 6 5
#> 8 4 12 5 1 6 3 6 3 2
The idea is to reshape it to make the sums easier. Then you can join the final result back to the initial dataframe.
start by uniquely identifying each row with a rowid.
reshape with pivot_longer so to have all values neatly in one column. In this step you also separate the names Love/Hate_channel in two and you remove the Love/Hate part (you are interested only on the channel) [that is what the NA does!].
reshape again: this time you want to get one column for each channel. In this step you also sum up what previously was Love and Hate together for each rowid and channel (that's what values_fn=sum does!). Also you add a prefix (names_prefix = "ALL_") to each new column name to have names that respect your expected final result.
with right_join you add the values back to the original dataframe. You have no need for rowid now, so you can remove it.
I am new to R, and I'd like help in finding a better way to write the following code I've written. Any help would be appreciated.
# Assign rank 1..6 according to the interval that `score` falls into.
# NOTE(review): neighbouring intervals share their boundary value (1.2, 2.1,
# ...). Assuming between() includes both ends, a score exactly on a boundary
# is matched by two lines and ends up with the rank from the later one —
# confirm this is intended.
# Rows whose score matches none of the intervals are not assigned here.
df$rank[between(df$score,0,1.2)] <- 1
df$rank[between(df$score,1.2,2.1)] <- 2
df$rank[between(df$score,2.1,2.9)] <- 3
df$rank[between(df$score,2.9,3.7)] <- 4
df$rank[between(df$score,3.7,4.5)] <- 5
df$rank[between(df$score,4.5,5.4)] <- 6
You can use cut:
# cut() returns the index of the interval each score falls into
# (labels = FALSE gives integer codes instead of a factor).
# Intervals are closed on the right: (0, 1.2], (1.2, 2.1], ..., (5.4, Inf].
# include.lowest = TRUE also puts a score of exactly 0 into the first
# interval, matching between(score, 0, 1.2) in the question; without it a
# score of 0 would become NA. Scores above 5.4 land in a 7th interval.
df$rank <- cut(
  x = df$score,
  breaks = c(0, 1.2, 2.1, 2.9, 3.7, 4.5, 5.4, Inf),
  labels = FALSE,
  include.lowest = TRUE
)
library(dplyr)
set.seed(1234)
# 15 rows: `rank` starts at 0, `score` is drawn uniformly between 0 and 6
df <- data.frame(
  rank = numeric(15),
  score = runif(n = 15, min = 0, max = 6)
)
df
#> rank score
#> 1 0 0.68222047
#> 2 0 3.73379643
#> 3 0 3.65564840
#> 4 0 3.74027665
#> 5 0 5.16549230
#> 6 0 3.84186363
#> 7 0 0.05697454
#> 8 0 1.39530304
#> 9 0 3.99650255
#> 10 0 3.08550685
#> 11 0 4.16154775
#> 12 0 3.26984901
#> 13 0 1.69640150
#> 14 0 5.54060091
#> 15 0 1.75389504
# The first interval that contains the score decides the rank; a score that
# fits no interval gets NA.
df %>%
  mutate(
    rank = case_when(
      score >= 0 & score <= 1.2 ~ 1,
      score >= 1.2 & score <= 2.1 ~ 2,
      score >= 2.1 & score <= 2.9 ~ 3,
      score >= 2.9 & score <= 3.7 ~ 4,
      score >= 3.7 & score <= 4.5 ~ 5,
      score >= 4.5 & score <= 5.4 ~ 6
    )
  )
#> rank score
#> 1 1 0.68222047
#> 2 5 3.73379643
#> 3 4 3.65564840
#> 4 5 3.74027665
#> 5 6 5.16549230
#> 6 5 3.84186363
#> 7 1 0.05697454
#> 8 2 1.39530304
#> 9 5 3.99650255
#> 10 4 3.08550685
#> 11 5 4.16154775
#> 12 4 3.26984901
#> 13 2 1.69640150
#> 14 NA 5.54060091
#> 15 2 1.75389504
Created on 2018-04-29 by the reprex package (v0.2.0).
As you didn't add a reproducible example, I created a little one (but keep in mind you should always add an example).
Using ifelse from base you could do this way:
# A plain data.frame is enough here: nothing below needs data.table, and
# data.table() is not available unless that package is loaded.
df <- data.frame(rank = c(1.2, 3.3, 2.5, 3.7, 5.8, 6, 3, 1.1, 0.5))
# Each nested ifelse() handles one interval (open on the left, closed on the
# right); every value that reaches the innermost "no" branch gets 6.
df$rank2 <- ifelse(df$rank > 0 & df$rank <= 1.2, 1,
  ifelse(df$rank > 1.2 & df$rank <= 2.1, 2,
    ifelse(df$rank > 2.1 & df$rank <= 2.9, 3,
      ifelse(df$rank > 2.9 & df$rank <= 3.7, 4,
        ifelse(df$rank > 3.7 & df$rank <= 4.5, 5, 6)
      )
    )
  )
)
The last ifelse should be your maximum rank value, so the "no" argument will be the last range.
If this is a recurring problem you should create a function.
Hope it helps.