update names based on columns R - r

Following up on this question (update names based on columns), there is another thing I want to ask.
df <- data.frame(name1 = c("a", "a", "a", "a", 'a', NA, NA, NA,NA),
name2 = c("b", "b", "b", "b", "c", NA, NA, NA,NA),
name3 = c("b", "b", "b", "b", "c", "a", "a", "a", "f"))
df
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> a
7 <NA> <NA> a
8 <NA> <NA> a
9 <NA> <NA> f
Now, I want to keep f while replacing a by b.
-Desired output
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b
9 <NA> <NA> f
The code from comments of #Rui and #TarJae
df %>%
mutate(name3 = case_when(
any(name1 == "a") & is.na(name2) ~ "b",
TRUE ~ name3
))
However, in this case, this does not work because I call NA from name2.
Any suggestions for me?

If you just want to keep f, how about this?
Edit: values other than a and b will not be changed.
df %>%
mutate(name3 = case_when(
!(name3 %in% c("a", "b")) ~ name3,
any(name1 == "a") & is.na(name2) ~ "b",
TRUE ~ name3
))
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b
9 <NA> <NA> f

Generalizing the solution #Tjn25, one can do the following
# Map every "a" in a vector to "b"; other values are returned unchanged and
# NA stays NA.
f <- function(x) {
  is_a <- x == "a"
  ifelse(is_a, "b", x)
}
data.frame(lapply(df, f))

If you just want to change the name3 column values that are a to b
then you could use
df$name3[df$name3 == 'a'] <- 'b'

Related

update names based on columns

I would like update the names based on two columns
My example has 3 original columns
df <- data.frame(name1 = c("a", "a", "a", "a", 'a', NA, NA, NA),
name2 = c("b", "b", "b", "b", "c", NA, NA, NA),
name3 = c("b", "b", "b", "b", "c", "a", "a", "a"))
df
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> a
7 <NA> <NA> a
8 <NA> <NA> a
I would like to update column name3 (or even create a new column) saying that if name1 == a, and name2 == NA, then the a character in name3 will be replaced by b in column name2.
My desired output something like
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b
So far, I am using df %>% mutate(name3 = ifelse(name1 == "a" & is.na(name2), "b", name3)), but now NAs appear. Any suggestions for this?
Base R
# any(df$name1 == "a") collapses the whole column to a single value, so it is
# not a per-row test: once any name1 is "a", name3 becomes "b" in every row
# where name2 is NA and keeps its current value in all other rows.
df$name3 <- ifelse(any(df$name1 == "a") & is.na(df$name2), "b", df$name3)
dplyr
library(dplyr)
df %>%
mutate(name3 = case_when(
any(name1 == "a") & is.na(name2) ~ "b",
TRUE ~ name3
))
# name1 name2 name3
#1 a b b
#2 a b b
#3 a b b
#4 a b b
#5 a c c
#6 <NA> <NA> b
#7 <NA> <NA> b
#8 <NA> <NA> b
We can replace == with %in% to eliminate the NAs, because R evaluates NA %in% x to FALSE, but NA==x to NA
df %>% mutate(name3 = ifelse(name1 %in% 'a' & is.na(name2), 'b', name3))
We could use a case_when or ifelse statement:
library(dplyr)
df %>%
mutate(name3 = case_when(any(name1 %in% "a") &
is.na(name2) ~ "b",
TRUE ~ name3))
or:
df %>%
mutate(name3 = ifelse(any(name1 %in% "a") &
is.na(name2), "b", name3))
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b

Count number of element for each row in a matrix [duplicate]

This question already has answers here:
Count number of values in row using dplyr
(5 answers)
Counting number of instances of a condition per row R [duplicate]
(1 answer)
Closed 2 years ago.
Hello I have a matrix such as :
COL1 COL2 COL3
A "A" "B" NA
B "B" "B" "C"
C NA NA NA
D "B" "B" "B"
E NA NA "C"
F "A" "A" "C"
and I would like, for each row (A, B, C, D, etc.), to get the number of letters that are A or B
Example:
Nb
A 2
B 2
C 0
D 3
E 0
F 2
Does someone have an idea?
another way is to use sapply:
# For each row, count the cells equal to "A" or "B"; %in% treats NA as
# "no match", so missing cells contribute 0.
# seq_len() is used instead of 1:nrow(df) so a zero-row df does not iterate
# over c(1, 0).
df$n <- sapply(seq_len(nrow(df)), function(i) sum(df[i, ] %in% c("A", "B")))
# COL1 COL2 COL3 n
# A A B <NA> 2
# B B B C 2
# C <NA> <NA> <NA> 0
# D B B B 3
# E <NA> <NA> C 0
# F A A C 2
You can achieve the same output by using purrr::map_dbl as well. Just replace sapply with map_dbl.
You can try a base R solution with apply():
# Base R: per-row count of cells that are "A" or "B".
# %in% already yields FALSE for NA, so no separate NA check is needed.
df$Var <- apply(df, 1, function(row) sum(row %in% c("A", "B")))
Output:
COL1 COL2 COL3 Var
A A B <NA> 2
B B B C 2
C <NA> <NA> <NA> 0
D B B B 3
E <NA> <NA> C 0
F A A C 2
Some data used:
#Data
df <- structure(list(COL1 = c("A", "B", NA, "B", NA, "A"), COL2 = c("B",
"B", NA, "B", NA, "A"), COL3 = c(NA, "C", NA, "B", "C", "C")), row.names = c("A",
"B", "C", "D", "E", "F"), class = "data.frame")
Or if you feel curious about tidyverse:
library(tidyverse)
# Count the "A"/"B" cells per row by reshaping to long format and joining the
# per-row counts back onto the original rows.
# Tag each row with a temporary id so the counts can be matched back.
df %>% mutate(id=1:n()) %>%
# Right-hand side of the join: one (id, Var) row for every id that has at
# least one "A"/"B" cell; ids without a match are removed by filter() and
# therefore come back from the join with Var = NA.
left_join(df %>% mutate(id=1:n()) %>%
pivot_longer(cols = -id) %>%
filter(value %in% c('A','B')) %>%
group_by(id) %>%
summarise(Var=n())) %>% ungroup() %>%
# NOTE: replace(is.na(.), 0) zeroes every NA in the data frame, not only Var,
# so the missing COL1..COL3 cells are shown as 0 in the result.
replace(is.na(.),0) %>% select(-id)
Output:
COL1 COL2 COL3 Var
1 A B 0 2
2 B B C 2
3 0 0 0 0
4 B B B 3
5 0 0 C 0
6 A A C 2
library(dplyr)
df <- structure(list(COL1 = c("A", "B", NA, "B", NA, "A"), COL2 = c("B",
"B", NA, "B", NA, "A"), COL3 = c(NA, "C", NA, "B", "C", "C")), row.names = c("A",
"B", "C", "D", "E", "F"), class = "data.frame")
df %>%
rowwise() %>%
mutate(sumVar = across(c(COL1:COL3),~ifelse(. %in% c("A", "B"),1,0)) %>% sum)
# A tibble: 6 x 4
# Rowwise:
COL1 COL2 COL3 sumVar
<chr> <chr> <chr> <dbl>
1 A B NA 2
2 B B C 2
3 NA NA NA 0
4 B B B 3
5 NA NA C 0
6 A A C 2

R - filter row if id column is unique or, when id is duplicated, the row with least NAs

I have a dataframe like this:
set.seed(123)
testdf <- data.frame(id = c(123,124,125,125,126,126,126,127,128,129,130),
var01 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var02 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var03 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var04 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var05 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var06 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var07 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var08 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var09 = c(sample(c("A", "B", "C", NA), 11, TRUE)),
var10 = c(sample(c("A", "B", "C", NA), 11, TRUE)))
testdf
id var01 var02 var03 var04 var05 var06 var07 var08 var09 var10
1 123 B B C <NA> A A <NA> C <NA> C
2 124 <NA> C <NA> A A A <NA> B A C
3 125 B C C B A <NA> <NA> A A B
4 125 <NA> A C <NA> B <NA> B A C B
5 126 <NA> <NA> C A B B <NA> C B <NA>
6 126 A A C B <NA> C C B C B
7 126 C A B A A A C <NA> B <NA>
8 127 <NA> B A A B B A A A <NA>
9 128 C <NA> <NA> B <NA> B B B <NA> C
10 129 B <NA> <NA> B A <NA> A <NA> A B
11 130 <NA> C C B C B B <NA> B A
I want to filter rows based on 2 conditions:
1) Rows with a unique id.
2) When ID is duplicated, I want to keep the row with the least NAs in that row.
My desired output consists of all rows except 4, 5 and 7.
You can assume the minimum NAs in a row for each id occurs only once (so 2 for id 125 and 1 for 126 in the example).
I prefer a base R or dplyr solution.
Many thanks in advance.
library(dplyr)
testdf %>%
mutate(NAs = rowSums(is.na(.))) %>%
group_by(id) %>%
filter(NAs == min(NAs)) %>%
select(-NAs) %>%
ungroup
Or
testdf %>%
arrange(id, rowSums(is.na(.))) %>%
group_by(id) %>%
slice(1) %>%
ungroup
I noticed I asked this question years ago, when I was less experienced with R. In case it is ever useful to anyone, this might be the shortest solution:
# Sort the rows of each id by their NA count (fewest first), then keep the
# first row per id; .keep_all = TRUE retains the remaining columns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
testdf %>%
  arrange(id, rowSums(is.na(.))) %>%
  distinct(id, .keep_all = TRUE)

r - count number of identical rows

I hope this is not a duplicate question (did my best to see if it was already asked). I have a data frame and would like to count how many rows are identical.
# Example data: nine ids, each with up to four values (NA where absent).
df <- data.frame(
  ID = c("id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8", "id9"),
  Val1 = c("A", "B", "C", "A", "A", "B", "D", "C", "D"),
  Val2 = c("B", "C", NA, "B", "B", "D", "E", "D", "E"),
  Val3 = c("C", NA, NA, "C", "C", "B", NA, NA, NA),
  Val4 = c("D", NA, NA, "E", "D", NA, NA, NA, NA)
)
> df
ID Val1 Val2 Val3 Val4
1 id1 A B C D
2 id2 B C <NA> <NA>
3 id3 C <NA> <NA> <NA>
4 id4 A B C E
5 id5 A B C D
6 id6 B D B <NA>
7 id7 D E <NA> <NA>
8 id8 C D <NA> <NA>
9 id9 D E <NA> <NA>
So for this example I expect that the return would be A B C D 2, D E 2, B C <NA> <NA> 1 and so on..
Tried with table but I get an Error in table(type_table) : attempt to make a table with >= 2^31 elements and my df has "only" ~140K rows. I want to apply this on a much larger dataset. Tried with summarise also but probably I do not know how to apply it correctly. Is aggregate an option? Thank you
The reason why table isn't working is because it treats each column separately and tries to find by element combinations instead of by row combinations.
You can try using the do.call(paste( combination in order to paste elements by row and run table over it
table(do.call(paste, df[-1]))
# A B C D A B C E B C NA NA B D B NA C D NA NA C NA NA NA D E NA NA
# 2 1 1 1 1 1 2
If table isn't efficient enough, we can try with .N from data.table instead
library(data.table)
setDT(df)[, .N, by = c(names(df)[-1])]
# Val1 Val2 Val3 Val4 N
# 1: A B C D 2
# 2: B C NA NA 1
# 3: C NA NA NA 1
# 4: A B C E 1
# 5: B D B NA 1
# 6: D E NA NA 2
# 7: C D NA NA 1
With data.table
library(data.table)
setDT(df)
df[, dups := 1:.N, setdiff(names(df), "ID")]
df[, .SD[.N], setdiff(names(df), c("ID", "dups"))][dups != 1]
Group by everything except ID, index items within groups of duplicates, then select the last row in each group (when the duplication index isn't 1).

Count numbers of elements in rows considering start column connected to index variable in the same data frame

Hi everybody, I am trying to solve a little problem in R. I have the following data frame (I add the dput() version at the end):
ID Index s1 s2 s3 s4 s5 s6 s7 s8 s9 s10
1 a 1 A A A A A A A A A A
2 b 3 <NA> <NA> A A A A A A A A
3 c 5 <NA> <NA> <NA> <NA> A A A A A A
4 d 1 A A A A A A A A A A
5 e 1 A A A A A A A A A A
6 f 6 <NA> <NA> <NA> <NA> <NA> A A A A A
7 g 6 <NA> <NA> <NA> <NA> <NA> A A A A A
8 h 4 <NA> <NA> <NA> A A A A A A A
In my data frame Test, Index shows the position of the first non-NA value in each row. Now I would like to compute, for each row, a new variable with the number of A values over a range of columns defined like this: the first column is the one holding the first non-NA value, and the range runs until 4 columns later. For example, ID b has Index=3, so I want to count the number of A values from column s3 until column s3+4=s7. The same applies to all rows. I would like to get something like this:
ID Index s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 Count
1 a 1 A A A A A A A A A A 5
2 b 3 <NA> <NA> A A A A A A A A 5
3 c 5 <NA> <NA> <NA> <NA> A A A A A A 5
4 d 1 A A A A A A A A A A 5
5 e 1 A A A A A A A A A A 5
6 f 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
7 g 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
8 h 4 <NA> <NA> <NA> A A A A A A A 5
I used this code but I got an error:
Test$Count=apply(Test[,c(-1,-2,Test$Index.Test$Index+4)] , 1 , function(x) length(which(!is.na(x) & x=="A")))
Any help is welcome. The dput() version of my data frame is the following:
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
Index = c(1, 3, 5, 1, 1, 6, 6, 4), s1 = c("A", NA, NA, "A",
"A", NA, NA, NA), s2 = c("A", NA, NA, "A", "A", NA, NA, NA
), s3 = c("A", "A", NA, "A", "A", NA, NA, NA), s4 = c("A",
"A", NA, "A", "A", NA, NA, "A"), s5 = c("A", "A", "A", "A",
"A", NA, NA, "A"), s6 = c("A", "A", "A", "A", "A", "A", "A",
"A"), s7 = c("A", "A", "A", "A", "A", "A", "A", "A"), s8 = c("A",
"A", "A", "A", "A", "A", "A", "A"), s9 = c("A", "A", "A",
"A", "A", "A", "A", "A"), s10 = c("A", "A", "A", "A", "A",
"A", "A", "A")), .Names = c("ID", "Index", "s1", "s2", "s3",
"s4", "s5", "s6", "s7", "s8", "s9", "s10"), row.names = c(NA,
8L), class = "data.frame")
Thanks a lot.
Here's an easier approach:
# Number of s-columns, i.e. everything after the first two columns
# (ID and Index in the example data).
mcol <- ncol(Test) - 2
# For each row, count the "A" cells in the window that starts at that row's
# Index and spans up to five s-columns, clamped to the last s-column.
# seq_len() is safe for a zero-row Test, and vapply() pins the result to one
# integer per row.
Test$Count <- vapply(seq_len(nrow(Test)), function(x) {
  idx <- Test$Index[x]
  idx2 <- seq(idx, min(idx + 4, mcol))
  # Comparisons against NA cells yield NA; na.rm = TRUE drops them.
  sum(Test[x, -(1:2)][, idx2] == "A", na.rm = TRUE)
}, integer(1))
ID Index s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 Count
1 a 1 A A A A A A A A A A 5
2 b 3 <NA> <NA> A A A A A A A A 5
3 c 5 <NA> <NA> <NA> <NA> A A A A A A 5
4 d 1 A A A A A A A A A A 5
5 e 1 A A A A A A A A A A 5
6 f 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
7 g 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
8 h 4 <NA> <NA> <NA> A A A A A A A 5
This variation will count As. Don't know if you actually have other possible letters, but if you do, you'll need to do something like this:
# Count the "A" cells in the five-column window that starts at each row's
# Index, and return Test with that count bound on as a Count column.
# apply() passes each row of Test[-1] as one vector: element 1 is Index and
# the rest are the s-columns. With character s-columns the whole row is
# character, so Index is converted back to a number explicitly.
cbind(
  Test,
  Count = apply(
    Test[-1], 1,
    function(x) {
      s_vals <- x[-1]
      first <- as.numeric(x[[1]])
      # Clamp the window to the last s-column and ignore NA cells, so a short
      # or gappy window yields a count instead of NA.
      last <- min(first + 4, length(s_vals))
      sum(s_vals[first:last] == "A", na.rm = TRUE)
    }
  )
)
# ID Index s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 Count
# 1 a 1 A A A A A A A A A A 5
# 2 b 3 <NA> <NA> A A A A A A A A 5
# 3 c 5 <NA> <NA> <NA> <NA> A A A A A A 5
# 4 d 1 A A A A A A A A A A 5
# 5 e 1 A A A A A A A A A A 5
# 6 f 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
# 7 g 6 <NA> <NA> <NA> <NA> <NA> A A A A A 5
# 8 h 4 <NA> <NA> <NA> A A A A A A A 5
I think the main issue with your approach is you're trying to mix positive and negative indices for the columns of Test. It's also not entirely clear what Test$Index.Test$Index is inside your apply.

Resources