Add a new column in R with presence/absence info - r

Hello, I have a data frame such as:
Col1 Col2
A 23
B NA
C 21
D 2
E NA
F 9
and I would like to add a new column, Col3, with presence/absence info (1/0):
If the number in Col2 is >= 1, I put 1.
If it is NA, I put 0.
and get :
Col1 Col2 Col3
A 23 1
B NA 0
C 21 1
D 2 1
E NA 0
F 9 1

You could assign Col3 as 1 if Col2 is greater than or equal to 1 and is not NA.
df$Col3 <- +(df$Col2 >= 1 & !is.na(df$Col2))
df
# Col1 Col2 Col3
#1 A 23 1
#2 B NA 0
#3 C 21 1
#4 D 2 1
#5 E NA 0
#6 F 9 1
The unary + at the beginning converts the logical values TRUE/FALSE to the integer values 1/0.
data
df <- structure(list(Col1 = structure(1:6, .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), Col2 = c(23L, NA, 21L, 2L,
NA, 9L)), class = "data.frame", row.names = c(NA, -6L))

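Another tidy way might be the following (note that it only tests for NA, so every non-missing value, including any value below 1, is coded as 1):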
library(dplyr)
mutate(df,
Col3 = ifelse(Col2 %in% NA,0,1)
)

We can use dplyr
library(dplyr)
df %>%
mutate(Col3 = as.integer(Col2 >=1 & !is.na(Col2)))
# Col1 Col2 Col3
#1 A 23 1
#2 B NA 0
#3 C 21 1
#4 D 2 1
#5 E NA 0
#6 F 9 1
data
df <- structure(list(Col1 = structure(1:6, .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), Col2 = c(23L, NA, 21L, 2L,
NA, 9L)), class = "data.frame", row.names = c(NA, -6L))

Related

How to use or should I use case_when to change values when using the list of variables

I believe what I'd like to do is relatively simple; I just don't seem to know the proper terminology to find the answer to my question. I have a data frame with 9 variables. I want to create a new variable that is based on the values in another column. Simple example:
my.df <- data.frame(col1 = sample(c(1,2), 10, replace = TRUE),
col2 = as.factor(sample(10)), col3 = letters[1:10],
col4 = sample(c(TRUE, FALSE), 10, replace = TRUE))
col1 col2 col3 col4
1 2 8 a TRUE
2 1 3 b FALSE
3 2 4 c FALSE
4 2 2 d TRUE
5 2 7 e FALSE
6 2 9 f TRUE
7 2 10 g FALSE
8 2 6 h FALSE
9 1 1 i FALSE
10 2 5 j FALSE
I would like to create col5 by using information from col3. I am expecting something like this:
my.df<-my.df %>%
mutate(col5 = case_when(col3 = c("a", "b", "c") ~"green",
col3 = c("g", "h", "i", "j")~"red",
col3 = c("d", "e", "f")~"purple"))
I am expecting results like this:
col1 col2 col3 col4 col5
1 2 8 a TRUE green
2 1 3 b FALSE green
3 2 4 c FALSE green
4 2 2 d TRUE purple
5 2 7 e FALSE purple
6 2 9 f TRUE purple
7 2 10 g FALSE red
8 2 6 h FALSE red
9 1 1 i FALSE red
10 2 5 j FALSE red
The error is: must be a logical vector, not a character vector.
If I change this to use == (i.e. col3 == c(...)), I get warning messages, and the problem is that the longer object length is not a multiple of the shorter object length.
My solution eventually was to create a vector of just the names and then use %in%. However, I really think there should be a simpler way to do this, or maybe a different command where I don't have to change values row by row.
Example of what I did get to work, which I had to do for each color:
grn<-c("a", "b", "c")
my.df<-my.df %>%
mutate(col5 = case_when(col3 %in% grn~"green")
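You can use %in% to compare against multiple values. Unlike ==, which compares element by element and recycles the shorter vector, %in% tests each value of col3 for membership in the given set -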
library(dplyr)
my.df %>%
mutate(col5 = case_when(col3 %in% c("a", "b", "c") ~"green",
col3 %in% c("g", "h", "i", "j")~"red",
col3 %in% c("d", "e", "f")~"purple"))
Perhaps this helps
library(dplyr)
library(stringr)
my.df %>%
group_by(grp = cumsum(col4)) %>%
mutate(col5 = setNames(c('green', 'red', 'purple'),
c('abc', 'ghij', 'def'))[str_c(col3, collapse='')]) %>%
ungroup %>%
select(-grp)
-output
# A tibble: 10 x 5
col1 col2 col3 col4 col5
<int> <int> <chr> <lgl> <chr>
1 2 8 a TRUE green
2 1 3 b FALSE green
3 2 4 c FALSE green
4 2 2 d TRUE purple
5 2 7 e FALSE purple
6 2 9 f FALSE purple
7 2 10 g TRUE red
8 2 6 h FALSE red
9 1 1 i FALSE red
10 2 5 j FALSE red
data
my.df <- structure(list(col1 = c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L
), col2 = c(8L, 3L, 4L, 2L, 7L, 9L, 10L, 6L, 1L, 5L), col3 = c("a",
"b", "c", "d", "e", "f", "g", "h", "i", "j"), col4 = c(TRUE,
FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE)),
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")

Change value in one column if a value in another is in a list [R]

Hello I have a df such as :
COL1 COL2
A 1
B 2
C 3
D 4
E 5
F 6
List<-c("A","C")
and if a COL1 value is in List, then put "OK" into COL2.
I should then get:
COL1 COL2
A OK
B 2
C OK
D 4
E 5
F 6
Here are the data
structure(list(COL1 = structure(1:6, .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), COL2 = 1:6), class = "data.frame", row.names = c(NA,
-6L))
You can use %in% + replace like below (note that COL2 becomes a character column, because "OK" cannot be stored in an integer vector)
transform(
df,
COL2 = replace(COL2, COL1 %in% List, "OK")
)
which gives
COL1 COL2
1 A OK
2 B 2
3 C OK
4 D 4
5 E 5
6 F 6
A dplyr option
> df %>%
+ mutate_at("COL2", ~ replace(., COL1 %in% List, "OK"))
COL1 COL2
1 A OK
2 B 2
3 C OK
4 D 4
5 E 5
6 F 6

Check if one row is equal to any other rows in R

I have a dataset with one ID column, 12 information columns (strings) and n rows. It looks like this:
ID Col1 Col2 Col3 Col4 Col5 ...
01 a b c d a
02 a a a a a
03 b b b b b
...
I need to go row by row and check if that row (considering all of its columns) is equal to any other row in the dataset. My output needs to be two new columns: one indicating if that particular row is equal to any other row and a second column indicating which row it is equal to (in case of TRUE in the previous column).
I appreciate any suggestions.
Assuming DF in the Note at the end, sort it and create a column dup indicating whether there exists a prior duplicate row. Then set wx to the row number in the original data frame of the duplicate. Finally, re-sort back to the original order.
We have assumed that duplicate means that the columns other than the ID are the same, but that is readily changed if need be. We have also assumed that we should mark the second and subsequent rows among duplicates, whereas the first is not so marked because it has, to that point, no duplicate.
The question does not address the situation of more than 2 identical rows but if that situation exists then each duplicate will point to the nearest prior row of which it is a duplicate.
o <- do.call("order", DF[-1])
DFo <- DF[o, ]
DFo$wx <- DFo$dup <- duplicated(DFo)
DFo$wx[DFo$dup] <- as.numeric(rownames(DFo))[which(DFo$dup) - 1]
DFo[order(o), ] # back to original order
giving:
ID Col1 Col2 Col3 Col4 Col5 dup wx
1 1 a b c d a FALSE 0
2 2 a a a a a FALSE 0
3 3 b b b b b FALSE 0
4 1 a b c d a TRUE 1
Note
Lines <- "ID Col1 Col2 Col3 Col4 Col5
01 a b c d a
02 a a a a a
03 b b b b b"
DF <- read.table(text = Lines, header = TRUE)
DF <- DF[c(1:3, 1), ]
rownames(DF) <- NULL
giving:
> DF
ID Col1 Col2 Col3 Col4 Col5
1 1 a b c d a
2 2 a a a a a
3 3 b b b b b
4 1 a b c d a
With a df like below:
ID Col1 Col2 Col3 Col4 Col5
1 1 a b c d a
2 2 a a a a a
3 3 b b b b b
4 3 b b b b b
You could try grouping by all columns and checking whether any count > 1 as well as pasting together row numbers (1:nrow(df)):
df <- transform(
df,
dupe = ave(ID, mget(names(df)), FUN = length) > 1,
dupeRows = ave(1:nrow(df), mget(names(df)), FUN = toString)
)
As this would get you a number for each row, even when there are no duplicates, you could do:
df$dupeRows <- with(df,
Map(function(x, y)
toString(x[x != y]),
strsplit(as.character(dupeRows), split = ', '),
1:nrow(df)))
Output:
ID Col1 Col2 Col3 Col4 Col5 dupe dupeRows
1 1 a b c d a FALSE
2 2 a a a a a FALSE
3 3 b b b b b TRUE 4
4 3 b b b b b TRUE 3
Data
df <- structure(list(ID = c(1L, 2L, 3L, 3L), Col1 = structure(c(1L,
1L, 2L, 2L), .Label = c("a", "b"), class = "factor"), Col2 = structure(c(2L,
1L, 2L, 2L), .Label = c("a", "b"), class = "factor"), Col3 = structure(c(3L,
1L, 2L, 2L), .Label = c("a", "b", "c"), class = "factor"), Col4 = structure(c(3L,
1L, 2L, 2L), .Label = c("a", "b", "d"), class = "factor"), Col5 = structure(c(1L,
1L, 2L, 2L), .Label = c("a", "b"), class = "factor")), row.names = c(NA,
-4L), class = "data.frame")
A dplyr solution
library(dplyr)
df %>%
mutate(row_num = 1:n(), is_dup = duplicated(df)) %>%
group_by(across(-c(row_num, is_dup))) %>%
mutate(
has_copies = n() > 1L,
which_row = if_else(is_dup, first(row_num), NA_integer_),
row_num = NULL, is_dup = NULL
)
Output
# A tibble: 5 x 8
# Groups: ID, Col1, Col2, Col3, Col4, Col5 [3]
ID Col1 Col2 Col3 Col4 Col5 has_copies which_row
<chr> <fct> <fct> <fct> <fct> <fct> <lgl> <int>
1 1 a b c d a FALSE NA
2 2 a a a a a FALSE NA
3 3 b b b b b TRUE NA
4 3 b b b b b TRUE 3
5 3 b b b b b TRUE 3
For each row that has more than one copy, has_copies is TRUE.
For a set of rows that are the same, I consider the first one as the original and all other rows as duplicates. In this regard, which_row gives you the index of the original for each duplicate it found. In other words, if a row has no duplicate or is the original, it gives you NA.

How to transport columns from one matrix to another matrix according to first column values

I have the following matrix:
1 a d
2 s c
4 d 0
7 f t
I want to have the following:
1 a d
2 s c
3 0 0
4 d 0
5 0 0
6 0 0
7 f t
Moreover, I would like it to be done in a way where I would not have to specify each column...
Thank you,
G
Or use merge
df2 <- merge(data.frame(V1 = seq_len(max(df[, 1]))), df, by = "V1", all.x = TRUE)
df2[is.na(df2)] <- 0
# V1 V2 V3
# 1 1 a d
# 2 2 s c
# 3 3 0 0
# 4 4 d 0
# 5 5 0 0
# 6 6 0 0
# 7 7 f t
Where df is
df <- structure(list(V1 = c(1L, 2L, 4L, 7L), V2 = c("a", "s", "d",
"f"), V3 = c("d", "c", "0", "t")), .Names = c("V1", "V2", "V3"
), class = "data.frame", row.names = c(NA, -4L))
If dat is a data.frame (it is better to store mixed-class columns in a data.frame than in a matrix):
dat2 <- as.data.frame(matrix(0, ncol=ncol(dat), nrow=max(dat$V1)))
dat2$V1 <- 1:nrow(dat2)
dat2[dat2$V1 %in% dat$V1,-1] <- unlist(dat[,-1])
dat2
# V1 V2 V3
#1 1 a d
#2 2 s c
#3 3 0 0
#4 4 d 0
#5 5 0 0
#6 6 0 0
#7 7 f t
Or you could do
dat1 <- transform(dat[rep(1:nrow(dat),c(1,diff(dat$V1))),], V1=seq_along(V1))
dat1[duplicated(dat1[,-1], fromLast=TRUE),-1] <- 0
data
dat <- structure(list(V1 = c(1L, 2L, 4L, 7L), V2 = c("a", "s", "d",
"f"), V3 = c("d", "c", "0", "t")), .Names = c("V1", "V2", "V3"
), class = "data.frame", row.names = c(NA, -4L))

R: Subsetting a data.table with repeated column names with numerical positions

I have a data.table that looks like this
> dput(DT)
A B C A B C D
1: 1 2 3 3 5 6 7
2: 2 1 3 2 1 3 4
Here's the dput
DT <- structure(list(A = 1:2, B = c(2L, 1L), C = c(3L, 3L), A = c(3L,
2L), B = c(5L, 1L), C = c(6L, 3L), D = c(7L, 4L)), .Names = c("A",
"B", "C", "A", "B", "C", "D"), row.names = c(NA, -2L), class = c("data.table",
"data.frame"))
Basically, I want to subset them according to their headers. So for header "B", I would do this:
subset(DT,,grep(unique(names(DT))[2],names(DT)))
B B
1: 2 2
2: 1 1
As you can see, the values are wrong as the second column is simply a repeat of the first. I want to get this instead:
B B
1: 2 5
2: 1 1
Can anyone help me please?
The following alternatives work for me:
pos <- grep("B", names(DT))
DT[, ..pos]
# B B
# 1: 2 5
# 2: 1 1
DT[, .SD, .SDcols = patterns("B")]
# B B
# 1: 2 5
# 2: 1 1
DT[, names(DT) %in% unique(names(DT))[2], with = FALSE]
# B B
# 1: 2 5
# 2: 1 1

Resources