Transform the dataframe in R - r

Can we transform this data as shown in expected output?
ColA ColB ColC
A - 0
B X 3
C Y 4
D X 51
D Y 32
Expected Output
ColA X Y
A 0 0
B 3 0
C 0 4
D 51 32

A short solution based on data.table (most recommended option if you have voluminous data)
library(data.table)

# Long-format input: one row per (colA, colB) pair. colB is NA for the row
# that has no category.
colA <- c("A", "B", "C", "D", "D")
colB <- c(NA, "X", "Y", "X", "Y")
colC <- c(0, 3, 4, 51, 32)
dt <- data.table(colA = colA, colB = colB, colC = colC)

# Reshape long -> wide: one row per colA, one column per colB value, cells
# taken from colC and missing combinations filled with 0. The formula and
# value.var are spelled out so dcast() does not have to guess the value column.
dt <- dcast(dt, colA ~ colB, value.var = "colC", fill = 0)

# The NA category gets its own column named "NA"; it is not wanted in the result.
dt[, "NA" := NULL]
dt
colA X Y
1: A 0 0
2: B 3 0
3: C 0 4
4: D 51 32
dcast makes your data from long to wide format.

Probably, something like this :
library(dplyr)
library(tidyr)

# Fold the "-" placeholder into "X" so that row A survives the pivot (its
# ColC is 0, so the X total is unaffected). Only ColB is recoded: ColC is
# left untouched and therefore keeps its integer type.
df %>%
  mutate(ColB = replace(ColB, ColB == "-", "X")) %>%
  pivot_wider(
    names_from = ColB,
    values_from = ColC,
    values_fill = list(ColC = 0L)
  )
# ColA X Y
# <fct> <int> <int>
#1 A 0 0
#2 B 3 0
#3 C 0 4
#4 D 51 32
data
df <- structure(list(ColA = structure(c(1L, 2L, 3L, 4L, 4L), .Label = c("A",
"B", "C", "D"), class = "factor"), ColB = structure(c(1L, 2L,
3L, 2L, 3L), .Label = c("-", "X", "Y"), class = "factor"), ColC = c(0L,
3L, 4L, 51L, 32L)), class = "data.frame", row.names = c(NA, -5L))

Yes we can.
Using xtabs:
# Cross-tabulate the sum of ColC by ColA (rows) and ColB (columns);
# combinations that do not occur become 0.
tab <- xtabs(ColC ~ ColA + ColB, dat)
# Drop the "-" placeholder column by name rather than by position, and keep
# the two-dimensional shape even if only one real column remains.
res <- tab[, colnames(tab) != "-", drop = FALSE]
# ColB
# ColA X Y
# A 0 0
# B 3 0
# C 0 4
# D 51 32
To get a data frame, do:
# as.data.frame() on a table object returns long format (one row per cell,
# with a Freq column). unclass() removes any table class first, so the wide
# matrix is converted as-is with ColA values as row names.
res <- as.data.frame(unclass(res))
res
# X Y
# A 0 0
# B 3 0
# C 0 4
# D 51 32
Or, all in one:
res <- as.data.frame(unclass(xtabs(ColC ~ ColA + ColB, dat)[,-1]))
Data
dat <- structure(list(ColA = c("A", "B", "C", "D", "D"), ColB = c("-",
"X", "Y", "X", "Y"), ColC = c(0L, 3L, 4L, 51L, 32L)), row.names = c(NA,
-5L), class = "data.frame")

Related

R dplyr - Filter unique row in each group with dplyr

My data looks like this
id col2 col3 flag val
1 a q
1 a w 1
1 b r
2 c q 1 5
2 c q
2 c q 1 6
I only want these rows
id col2 col3 flag val
1 a q
1 a w 1
1 b r
2 c q 1 5
Basically, the first 3 columns determine a group. For each group, if there is only 1 observation/row, keep that row no matter what the value of flag is. If a group has more than 1 observation/row, keep the first row in that group that has flag equal to 1. Is there any way to do this in R with dplyr?
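dplyr::distinct helps with exactly this, and the .keep_all flag keeps the other columns like in your output. Note that distinct keeps the first row of each group in the order it receives them, so within a group the row with flag equal to 1 must come first (as it does in the sample data).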
# distinct() keeps the first row it sees for each (id, col2, col3) group, so
# order every group with its flag == 1 rows first. %in% treats a missing flag
# as "not 1". Single-row groups are unaffected by the ordering.
# Note: the result is sorted by id, col2, col3.
my_data %>%
  arrange(id, col2, col3, !(flag %in% 1)) %>%
  distinct(id, col2, col3, .keep_all = TRUE)
Result
# A tibble: 4 x 5
id col2 col3 flag val
<int> <chr> <chr> <int> <int>
1 1 a q NA NA
2 1 a w 1 NA
3 1 b r NA NA
4 2 c q 1 5
Data
my_data <- tibble::tribble(
~id, ~col2, ~col3, ~flag, ~val,
1L, "a", "q", NA, NA,
1L, "a", "w", 1L, NA,
1L, "b", "r", NA, NA,
2L, "c", "q", 1L, 5L,
2L, "c", "q", NA, NA,
2L, "c", "q", 1L, 6L
)
dat %>%
  # remember the original row position so the input order can be restored
  mutate(rn = row_number()) %>%
  # rows with flag == 1 first (a missing or different flag counts as "not 1");
  # rn breaks ties so the earliest matching row wins
  arrange(!(flag %in% 1), rn) %>%
  group_by(id, col2, col3) %>%
  # first row per group: the first flag == 1 row if there is one,
  # otherwise the group's first row
  slice(1) %>%
  ungroup() %>%
  arrange(rn) %>%
  select(-rn)
# # A tibble: 4 x 5
# id col2 col3 flag val
# <int> <chr> <chr> <int> <int>
# 1 1 a q NA NA
# 2 1 a w 1 NA
# 3 1 b r NA NA
# 4 2 c q 1 5
If your data is instead strings with empty strings (it's not clear in the question), then
dat %>%
  # this is just to transform my number-based 'flag'/'val' to strings, you don't need this
  mutate(across(c(flag, val), ~ if_else(is.na(.), "", as.character(.)))) %>%
  # pick up here
  mutate(rn = row_number()) %>%
  # the arrange() step is the only difference from above: rows whose flag is
  # the string "1" come first, and rn breaks ties so the earliest one wins
  arrange(!(flag %in% "1"), rn) %>%
  group_by(id, col2, col3) %>%
  slice(1) %>%
  ungroup() %>%
  arrange(rn) %>%
  select(-rn)
# # A tibble: 4 x 5
# id col2 col3 flag val
# <int> <chr> <chr> <chr> <chr>
# 1 1 a q "" ""
# 2 1 a w "1" ""
# 3 1 b r "" ""
# 4 2 c q "1" "5"
The use of rn is merely to ensure that the order is preserved across the filtering. If order is not an issue (perhaps it's inferred some other way), then you can remove the mutate, and the trailing arrange(rn) %>% select(-rn).
Data
dat <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L), col2 = c("a", "a", "b", "c", "c", "c"), col3 = c("q", "w", "r", "q", "q", "q"), flag = c(NA, 1L, NA, 1L, NA, 1L), val = c(NA, NA, NA, 5L, NA, 6L)), class = "data.frame", row.names = c(NA, -6L))
You can select a row when :
there is only one row in the group OR
the 1st row where flag = 1 in the group.
library(dplyr)
# Keep a row when its (id, col2, col3) group has a single row, or when it is
# the first row in the group whose flag equals 1.
df %>%
group_by(id, col2, col3) %>%
# match(1, flag) is the position of the first flag == 1 within the group, or
# NA when there is none. filter() drops rows whose condition is NA, so a
# multi-row group without any flag == 1 contributes no rows.
filter(n() == 1 | row_number() == match(1, flag)) %>%
ungroup()
# id col2 col3 flag val
# <int> <chr> <chr> <int> <int>
#1 1 a q NA NA
#2 1 a w 1 NA
#3 1 b r NA NA
#4 2 c q 1 5
data
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L), col2 = c("a",
"a", "b", "c", "c", "c"), col3 = c("q", "w", "r", "q", "q", "q"
), flag = c(NA, 1L, NA, 1L, NA, 1L), val = c(NA, NA, NA, 5L,
NA, 6L)), class = "data.frame", row.names = c(NA, -6L))

How to switch values in a column?

I have the following column:
ColA ColB
1 f
2 f
1 f
1 f
2 f
How do I switch the 1's to 2's and 2's to 1's in ColA?
If there are only 1 and 2 values, we can make use of arithmetic to change the values
# With only the values 1 and 2 present, subtracting each value from their
# sum (3) swaps them: 1 -> 2 and 2 -> 1.
df1[["ColA"]] <- 3 - df1[["ColA"]]
df1[["ColA"]]
#[1] 2 1 2 2 1
Or using match
match(df1$ColA, c(2, 1))
Or using factor (note that this returns a factor rather than a numeric vector)
factor(df1$ColA, levels = 1:2, labels = 2:1)
data
df1 <- structure(list(ColA = c(1L, 2L, 1L, 1L, 2L), ColB = c("f", "f",
"f", "f", "f")), class = "data.frame", row.names = c(NA, -5L))
Here is another way to solve this, using transform from base R with ifelse.
# Data: (h/t #akrun)
df1 <- structure(list(ColA = c(1L, 2L, 1L, 1L, 2L), ColB = c("f", "f",
"f", "f", "f")), class = "data.frame", row.names = c(NA, -5L))
# Code:
transform(df1, ColA = ifelse(ColA == 1, 2, 1))
# Output:
#> ColA ColB
#> 1 2 f
#> 2 1 f
#> 3 2 f
#> 4 2 f
#> 5 1 f
Created on 2020-11-21 by the reprex package (v0.3.0)

How to do a binary if statement in R?

I have a data set with a column of letters, followed by another column of ones and zeroes. I want to total the amount of "ones" for each letter, but am unsure how to do so in an effective way.
I appreciate the help.
We can group by the first column ('col1') and then get the sum of 'col2'
library(dplyr)
df1 %>%
group_by(col1) %>%
summarise(Total = sum(col2))
Or in data.table
library(data.table)
setDT(df1)[, .(Total = sum(col2)), col1]
Or with base R
rowsum(df1$col2, df1$col1)
Here are some other base R solutions
> tapply(df$col2, df$col1, sum)
a b c
1 1 2
> xtabs(col2~col1,df)
col1
a b c
1 1 2
Dummy Data
df <- structure(list(col1 = structure(c(1L, 3L, 1L, 2L, 1L, 3L, 3L,
2L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"), col2 = c(0,
0, 0, 0, 1, 1, 1, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-10L))
> df
col1 col2
1 a 0
2 c 0
3 a 0
4 b 0
5 a 1
6 c 1
7 c 1
8 b 1
9 b 0
10 c 0

R increment by 1 for every change in value column and restart the counter

I would like to do something very similar to this question:
Increment by 1 for every change in column
But I want to restart the counter when var1 = c.
using
# Number the runs of consecutive equal values in var1: 1 for the first run,
# 2 for the second, and so on. as.character() lets rle() work when var1 is a
# factor.
df$var2 <- with(rle(as.character(df$var1)), rep(seq_along(values), lengths))
results in column var 2
var1 var2 Should be
a 1 1
a 1 1
1 2 2
0 3 3
b 4 4
b 4 4
b 4 4
c 5 1
1 6 2
1 6 2
In data.table you can use rleid to get a run-length-id for var1 within each group.
library(data.table)
# Convert df to a data.table by reference (no copy).
setDT(df)
# cumsum(var1 == "c") increases by 1 at every "c", so each "c" starts a new
# group; rleid() then numbers the runs of equal var1 values within that group,
# which restarts the counter at 1 on every "c".
df[, var2 := rleid(var1), by = cumsum(var1 == "c")]
df
# var1 var2
# 1: a 1
# 2: a 1
# 3: 1 2
# 4: 0 3
# 5: b 4
# 6: b 4
# 7: b 4
# 8: c 1
# 9: 1 2
#10: 1 2
and using dplyr
library(dplyr)
df %>%
  # start a new group at every "c" so the counter restarts there
  group_by(group = cumsum(var1 == "c")) %>%
  # within a group, add 1 each time var1 differs from the previous row; the
  # first row is compared with itself, so the counter starts at 1
  mutate(var2 = cumsum(var1 != lag(var1, default = first(var1))) + 1) %>%
  # return an ungrouped result without the helper column
  ungroup() %>%
  select(-group)
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA, -10L))
We can use the OP's code with rle in base R with ave
# ave() applies FUN separately to each group defined by cumsum(var1 == 'c')
# (a new group starts at every 'c'); FUN numbers the runs of equal values
# within the group. Because the input to ave() is character, its result is
# character too, hence the outer as.integer().
df$var2 <- with(df, as.integer(ave(as.character(var1), cumsum(var1 == 'c'),
FUN = function(x) with(rle(x), rep(seq_along(values), lengths)))))
df$var2
#[1] 1 1 2 3 4 4 4 1 2 2
data
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA,
-10L))

How to group by a Variable A to get a minimum value from Variable B based on a condition on Variable C

I have a dataset with 3 variables A, B and C. I need to group by A and get the minimum value of B among the rows where C is non-zero.
> data
A B C
1 a 3 0
2 a 6 1
3 a 9 2
4 a 12 2
5 b 3 0
6 b 6 0
7 b 9 0
8 b 12 4
Expected Output:
> output
1 2
1 a 6
2 b 12
I tried doing this which was running for more than 2 hours:
rbind(by(data, data$A, function(x) min(x$B[x$C>0])))
We group by 'A', get the min of 'B' where 'C' is not 0
library(dplyr)
# For each value of A, take the smallest B among the rows with C > 0.
# If a group has no such row, min() is called on an empty vector and
# returns Inf with a warning.
df1 %>%
group_by(A) %>%
summarise(B = min(B[C > 0]))
# A tibble: 2 x 2
# A B
# <chr> <int>
#1 a 6
#2 b 12
Or a faster option would be to filter first, then do the group_by (groups with no row where C > 0 are then dropped from the result rather than returned as Inf)
df1 %>%
filter(C > 0) %>%
group_by(A) %>%
summarise(B = min(B))
Or with data.table
library(data.table)
setDT(df1)[,.(B = min(B[C > 0])) , A]
data
df1 <- structure(list(A = c("a", "a", "a", "a", "b", "b", "b", "b"),
B = c(3L, 6L, 9L, 12L, 3L, 6L, 9L, 12L), C = c(0L, 1L, 2L,
2L, 0L, 0L, 0L, 4L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

Resources