replacing multiple values in data frame in R - r

I want to reassign multiple different character strings with the same value in a single call. However, the following code only replaces some of the values in each variable.
dat <-data.frame(x=c(rep("1=x",4),rep("b",4)),y=c(rep("1=z",4),rep("b",4)))
dat[] <- sapply(dat[], as.character)
dat[dat == c("1=x", "1=y")]<- 1
such that I get:
dat
x y
1 1 1=z
2 1=x 1=z
3 1 1=z
4 1=x 1=z
5 b b
6 b b
7 b b
8 b b
what I want is the following:
dat
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b

With dplyr:
# funs() is deprecated in dplyr; across() with a lambda applies the same
# replacement to every column: entries containing "1=" become 1.
library(dplyr)
dat <- mutate(dat, across(everything(), ~ replace(.x, grepl("1=", .x), 1)))
With Base R:
# Overwrite every entry containing "1=" with 1, column by column; assigning
# through dat[] keeps the object a data frame.
dat[] <- lapply(dat, function(column) {
  hits <- grepl("1=", column)
  column[hits] <- 1
  column
})
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
dat <- structure(list(x = c("1=x", "1=x", "1=x", "1=x", "b", "b", "b",
"b"), y = c("1=z", "1=z", "1=z", "1=z", "b", "b", "b", "b")), .Names = c("x",
"y"), row.names = c(NA, -8L), class = "data.frame")

Another Base R option if you want to make an explicit replacement of certain strings would be:
dat[] <- lapply(dat,function(x) ifelse(x %in% c("1=x", "1=z"), 1, x))
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
# Input data before the replacement (the values that the call above rewrites).
dat <- structure(list(x = c("1=x", "1=x", "1=x", "1=x", "b", "b", "b", "b"),
y = c("1=z", "1=z", "1=z", "1=z", "b", "b", "b", "b")), row.names = c(NA,
-8L), class = "data.frame")

Related

Assign a value to a column in R based on a percentage within each group

[]
I need to create column C in a data frame where 30% of the rows within each group (column B) get the value 0.
How do I do this in R?
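We may use rbinom after grouping by the 'category' column. Specify the prob as a vector of values; note that rbinom recycles prob element-wise across the draws, so c(0.7, 0.3) alternates the probability of a 1 between rows, whereas a single prob = 0.7 gives every row a 70% chance of a 1.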
library(dplyr)
df1 %>%
group_by(category) %>%
mutate(value = rbinom(n(), 1, c(0.7, 0.3))) %>%
ungroup
-output
# A tibble: 9 x 3
sno category value
<int> <chr> <int>
1 1 A 1
2 2 A 0
3 3 A 1
4 4 B 1
5 5 B 0
6 6 B 1
7 7 C 1
8 8 C 0
9 9 C 0
data
df1 <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B",
"B", "C", "C", "C")), class = "data.frame", row.names = c(NA,
-9L))
If your data already exist (assuming this is a simplified example), and if you want the value to be randomly assigned within each group:
library(dplyr)
d <- data.frame(sno = 1:9,
category = rep(c("A", "B", "C"), each = 3))
d %>%
group_by(category) %>%
mutate(value = sample(c(rep(1, floor(n()*.7)), rep(0, n() - floor(n()*.7)))))
Base R
set.seed(42)
d$value <- ave(
rep(0, nrow(d)), d$category,
FUN = function(z) sample(0:1, size = length(z), prob = c(0.3, 0.7), replace = TRUE)
)
d
# sno category value
# 1 1 A 0
# 2 2 A 0
# 3 3 A 1
# 4 4 B 0
# 5 5 B 1
# 6 6 B 1
# 7 7 C 0
# 8 8 C 1
# 9 9 C 1
Data copied from Brigadeiro's answer:
d <- structure(list(sno = 1:9, category = c("A", "A", "A", "B", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA, -9L))

Creating a new column of 1s and 0s as a way to count unique values in R

I would like to add a helper column of 0s and 1s to keep track of unique values based on one or more variables in R Programming.
Sample data:
df<- matrix(c("A","A","A","B","B","C","D","D","D","D"))
and what I would like is:
structure(c("A", "A", "A", "B", "B", "C", "D", "D", "D", "D",
"1", "0", "0", "1", "0", "1", "1", "0", "0", "0"), .Dim = c(10L,
2L))
I think you could use the following solution:
df <- as.data.frame(df)
df$Helper <- +!duplicated(df$V1)
df
V1 Helper
1 A 1
2 A 0
3 A 0
4 B 1
5 B 0
6 C 1
7 D 1
8 D 0
9 D 0
10 D 0
Using dplyr
library(dplyr)
library(data.table)
df %>%
mutate(Helper = +(rowid(id) == 1))
data
df <- structure(list(id = c("A", "A", "A", "B", "B", "C", "D", "D",
"D", "D")), class = "data.frame", row.names = c(NA, -10L))
Another base R option using ave
transform(
as.data.frame(df),
helper = +(ave(seq_along(V1),V1,FUN = seq_along)==1)
)
gives
V1 helper
1 A 1
2 A 0
3 A 0
4 B 1
5 B 0
6 C 1
7 D 1
8 D 0
9 D 0
10 D 0
A dplyr solution:
# Creating the dataframe:
df <- data.frame(id=c("A","A","A","B","B","C","D","D","D","D"))
library(dplyr)
df %>% group_by(id) %>% mutate(helper = ifelse(row_number()==1, 1,0))
# A tibble: 10 x 2
# Groups: id [4]
id helper
<chr> <dbl>
1 A 1
2 A 0
3 A 0
4 B 1
5 B 0
6 C 1
7 D 1
8 D 0
9 D 0
10 D 0
Here is another option using match -
library(dplyr)
df %>% mutate(result = as.integer(row_number() %in% match(unique(id), id)))
# id result
#1 A 1
#2 A 0
#3 A 0
#4 B 1
#5 B 0
#6 C 1
#7 D 1
#8 D 0
#9 D 0
#10 D 0
In base R -
# Flag the first occurrence of each id with 1 and every later one with 0.
# seq_len() stays empty for a zero-row data frame, where seq(nrow(df)) would
# give c(1, 0).
transform(df, result = as.integer(seq_len(nrow(df)) %in% match(unique(id), id)))

Wide to long, combining columns in pairs but keeping ID column - R

I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format to a long shape, similar to the one below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all combination of values for each ID if the number of rows is greater than 1 in that ID.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
group_by(ID) %>%
summarise(value = if(n() > 1) list(setNames(as.data.frame(t(combn(value, 2))),
c('col1', 'col2')))
else list(data.frame(col1 = value[1], col2 = NA_character_))) %>%
unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
df <- structure(list(ID = 1:5, case1 = c("A", "B", "E", "G", "T"),
case2 = c("B", "A", "F", "C", NA), case3 = c("C", NA, NA,
"A", NA), case4 = c("D", NA, NA, NA, NA)),
class = "data.frame", row.names = c(NA, -5L))

arrange a complicated data set in R

I have a large data set:
> ncol(d) [1] 1680 nrow(d) [1] 12
that it looks like this:
a b c e f g
3 2 5 1 3 6
a b c d e g
1 7 8 4 5 8
a c d e f h #in this row b does not exist
5 10 4 7 5 10
And I need that it looks like this:
a b c d e f g h
3 2 5 0 1 3 6 0
1 7 8 4 5 0 8 0
5 0 10 4 7 5 0 10 #and all the other columns ...
Since my data is really long and I have many corrections like this one to make across the whole data set, it is hard to do it by hand. I would like to know if there is a way to do this automatically, for example with a logical function or a loop.
Any idea is welcome
Regards
Here's a possible approach using data.table:
library(data.table)
melt(
setDT(
setnames(
data.table::transpose(df1),
paste(rep(1:(nrow(df1)/2), each = 2), c("name", "value"), sep = "_"))),
measure = patterns("name", "value"))[
, dcast(.SD, variable ~ value1, value.var = "value2", fill = 0)]
# variable a b c d e f g h
# 1: 1 3 2 5 0 1 3 6 0
# 2: 2 1 7 8 4 5 0 8 0
# 3: 3 5 0 10 4 7 5 0 10
We could get the alternate rows with a recycled logical vector, construct a data.frame and pivot it to wide format with pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
sub1 <- df1[c(TRUE, FALSE),]
sub2 <- df1[c(FALSE, TRUE),]
tibble(ind = c(row(sub1)), col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))) %>%
pivot_wider(names_from = col1, values_from = col2,
values_fill = list(col2 = 0)) %>%
select(-ind)
#A tibble: 3 x 8
# a b c d e f g h
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
Or using base R with reshape
out <- reshape(
data.frame(ind = c(row(sub1)),
col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))),
idvar = 'ind', direction = 'wide', timevar = 'col1')[-1]
names(out) <- sub("col2\\.", "", names(out))
out[is.na(out)] <- 0
row.names(out) <- NULL
out
# a b c d e f g h
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
data
df1 <- structure(list(v1 = c("a", "3", "a", "1", "a", "5"), v2 = c("b",
"2", "b", "7", "c", "10"), v3 = c("c", "5", "c", "8", "d", "4"
), v4 = c("e", "1", "d", "4", "e", "7"), v5 = c("f", "3", "e",
"5", "f", "5"), v6 = c("g", "6", "g", "8", "h", "10")), class = "data.frame",
row.names = c(NA,
-6L))

how can I regroup multiple categorical variables into a new variable

I have a data.frame (df) with 2 columns (A, B):
A B
1 a TCRB
2 a TCRG
3 a TCRB
4 b TCRB
5 b TCRG
6 c TCRB
7 c TCRB
8 c TCRB
9 c TCRB
10 d TCRG
11 d TCRG
12 d TCRG
I want to create a new column "C" as below that tells me whether each unique value in "A" has both TCRB and TCRG or only one of them (0 = TCRB only, 1 = TCRG only, 2 = both), as follows:
A: a b c d
C: 2 2 0 1
Greatly appreciate any help!
Here's an approach with dplyr:
library(dplyr)
df %>%
group_by(A) %>%
dplyr::summarise(C = case_when("TCRB" %in% B & "TCRG" %in% B ~ 2,
"TCRB" %in% B ~ 0,
"TCRG" %in% B ~ 1,
TRUE ~ NA_real_))
# A tibble: 4 x 2
A C
<fct> <dbl>
1 a 2
2 b 2
3 c 0
4 d 1
An option with n_distinct
library(dplyr)
df %>%
group_by(A) %>%
summarise(C = n_distinct(B) *!all(B == 'TCRB'))
# A tibble: 4 x 2
# A C
# <chr> <int>
#1 a 2
#2 b 2
#3 c 0
#4 d 1
data
df <- structure(list(A = c("a", "a", "a", "b", "b", "c", "c", "c",
"c", "d", "d", "d"), B = c("TCRB", "TCRG", "TCRB", "TCRB", "TCRG",
"TCRB", "TCRB", "TCRB", "TCRB", "TCRG", "TCRG", "TCRG")),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
In Base R, we can use aggregate :
aggregate(B~A, df, function(x) {
if(all(c('TCRB', 'TCRG') %in% x)) 2
else if(any(x == 'TCRG')) 1
else if(any(x == 'TCRB')) 0
else NA
})
# A B
#1 a 2
#2 b 2
#3 c 0
#4 d 1

Resources