R Converting Wide Data to Long - r

How can I convert my data from this:
example <- data.frame(RTD_1_LOC = c('A', 'B'), RTD_2_LOC = c('C', 'D'),
RTD_3_LOC = c('E', 'F'), RTD_4_LOC = c('G', 'H'),
RTD_5_LOC = c('I', 'J'),RTD_1_OFF = c('1', '2'), RTD_2_OFF = c('3', '4'),
RTD_3_OFF = c('5', '6'), RTD_4_OFF = c('7', '8'),
RTD_5_OFF = c('9', '10'))
to this:
example2 <- data.frame(RTD = c(1,1,2,2,3,3,4,4,5,5),LOC = c('A', 'B','C','D','E','F','G','H','I','J'),
OFF = c(1,2,3,4,5,6,7,8,9,10))
I have been using tidyverse gather, but I end up with about 50 columns
ex <- gather(example,RTD, Location, RTD_1_LOC:RTD_5_LOC)
ex$RTD <- sub('_LOC',"",ex$RTD)
ex3 <- gather(ex,RTD, Offset, RTD_1_OFF:RTD_5_OFF)
ex2$RTD <- sub('_OFF',"",ex2$RTD)

We can use pivot_longer from tidyr and specify the names_pattern to capture the groups from the column names. As the 'RTD' column should be left as such, specify in the names_to, a vector of 'RTD' and the column values (.value) so that the 'RTD' will get the digits capture ((\\d+) and the word ((\\w+)) 'LOC', 'OFF' will be created as new columns with the column values
library(dplyr)
library(tidyr)
example %>%
pivot_longer(cols = everything(),
names_to = c("RTD", ".value"), names_pattern = "\\w+_(\\d+)_(\\w+)")
-output
# A tibble: 10 x 3
RTD LOC OFF
<chr> <chr> <chr>
1 1 A 1
2 2 C 3
3 3 E 5
4 4 G 7
5 5 I 9
6 1 B 2
7 2 D 4
8 3 F 6
9 4 H 8
10 5 J 10

Related

Split df column of integers into individual digits in R

I have a df where one variable is an integer. I'd like to split this column into it's individual digits. See my example below
Group Number
A 456
B 3
C 18
To
Group Number Digit1 Digit2 Digit3
A 456 4 5 6
B 3 3 NA NA
C 18 1 8 NA
We can use read.fwf from base R. Find the max number of character (nchar) in 'Number' column (mx). Read the 'Number' column after converting to character (as.character), specify the 'widths' as 1 by replicating 1 with mx and assign the output to new 'Digit' columns in the data
mx <- max(nchar(df1$Number))
df1[paste0("Digit", seq_len(mx))] <- read.fwf(textConnection(
as.character(df1$Number)), widths = rep(1, mx))
-output
df1
# Group Number Digit1 Digit2 Digit3
#1 A 456 4 5 6
#2 B 3 3 NA NA
#3 C 18 1 8 NA
data
df1 <- structure(list(Group = c("A", "B", "C"), Number = c(456L, 3L,
18L)), class = "data.frame", row.names = c(NA, -3L))
Another base R option (I think #akrun's approach using read.fwf is much simpler)
cbind(
df,
with(
df,
type.convert(
`colnames<-`(do.call(
rbind,
lapply(
strsplit(as.character(Number), ""),
`length<-`, max(nchar(Number))
)
), paste0("Digit", seq(max(nchar(Number))))),
as.is = TRUE
)
)
)
which gives
Group Number Digit1 Digit2 Digit3
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Using splitstackshape::cSplit
splitstackshape::cSplit(df, 'Number', sep = '', stripWhite = FALSE, drop = FALSE)
# Group Number Number_1 Number_2 Number_3
#1: A 456 4 5 6
#2: B 3 3 NA NA
#3: C 18 1 8 NA
Updated
I realized I could use max function for counting characters limit in each row so that I could include it in my map2 function and save some lines of codes thanks to an accident that led to an inspiration by dear #ThomasIsCoding.
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)
df %>%
rowwise() %>%
mutate(map2_dfc(Number, 1:max(nchar(Number)), ~ str_sub(.x, .y, .y))) %>%
unnest(cols = !c(Group, Number)) %>%
rename_with(~ str_replace(., "\\.\\.\\.", "Digit"), .cols = !c(Group, Number)) %>%
mutate(across(!c(Group, Number), as.numeric, na.rm = TRUE))
# A tibble: 3 x 5
Group Number Digit1 Digit2 Digit3
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 456 4 5 6
2 B 3 3 NA NA
3 C 18 1 8 NA
Data
df <- tribble(
~Group, ~Number,
"A", 456,
"B", 3,
"C", 18
)
Two base r methods:
no_cols <- max(nchar(as.character(df1$Number)))
# Using `strsplit()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(strsplit(as.character(df1$Number), ""),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))
# Using `regmatches()` and `gregexpr()`:
cbind(df1, setNames(data.frame(do.call(rbind,
lapply(regmatches(df1$Number, gregexpr("\\d", df1$Number)),
function(x) {
length(x) <- no_cols
x
}
)
)
), paste0("Digit", seq_len(no_cols))))

Replacing value depending on paired column

I have a dataframe with two columns per sample (n > 1000 samples):
df <- data.frame(
"sample1.a" = 1:5, "sample1.b" = 2,
"sample2.a" = 2:6, "sample2.b" = c(1, 3, 3, 3, 3),
"sample3.a" = 3:7, "sample3.b" = 2)
If there is a zero in column .b, the correspsonding value in column .a should be set to NA.
I thought to write a function over colnames (without suffix) to filter each pair of columns and conditional exchaning values. Is there a simpler approach based on tidyverse?
We can split the data.frame into a list of data.frames and do the replacement in base R
df1 <- do.call(cbind, lapply(split.default(df,
sub("\\..*", "", names(df))), function(x) {
x[,1][x[2] == 0] <- NA
x}))
Or another option is Map
acols <- endsWith(names(df), "a")
bcols <- endsWith(names(df), "b")
df[acols] <- Map(function(x, y) replace(x, y == 0, NA), df[acols], df[bcols])
Or if the columns are alternate with 'a', 'b' columns, use a logical index for recycling, create the logical matrix with 'b' columns and assign the corresponding values in 'a' columns to NA
df[c(TRUE, FALSE)][df[c(FALSE, TRUE)] == 0] <- NA
or an option with tidyverse by reshaping into 'long' format (pivot_longer), changing the 'a' column to NA if there is a correspoinding 0 in 'a', and reshape back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, names_sep="\\.",
names_to = c('group', '.value')) %>%
mutate(a = na_if(b, a == 0)) %>%
pivot_wider(names_from = group, values_from = c(a, b)) %>%
select(-rn)
# A tibble: 5 x 6
# a_sample1 a_sample2 a_sample3 b_sample1 b_sample2 b_sample3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2 1 2 2 1 2
#2 2 3 2 2 3 2
#3 2 3 2 2 3 2
#4 2 3 2 2 3 2
#5 2 3 2 2 3 2

Using the value in one column to specify from which row to retrieve a value for a new column

I'm looking for an automated way of converting this:
dat = tribble(
~a, ~b, ~c
, 'x', 1, 'y'
, 'y', 2, NA
, 'q', 4, NA
, 'z', 3, 'q'
)
to:
tribble(
~a, ~b, ~d
, 'x', 1, 2
, 'z', 3, 4
)
So, the column c in dat encodes which row in dat to look at to grab a value for a new column d, and if c is NA, toss that row from the output. Any tips?
We can join dat with itself using c and a columns.
library(dplyr)
dat %>%
inner_join(dat %>% select(-c) %>% rename(d = 'b'),
by = c('c' = 'a'))
# A tibble: 2 x 4
# a b c d
# <chr> <dbl> <chr> <dbl>
#1 x 1 y 2
#2 z 3 q 4
In base R, we can do this with merge :
merge(dat, dat[-3], by.x = 'c', by.y = 'a')
We create the 'd' with lead of 'b' and filter out the NA rows of 'c' and remove the c column with select
library(dplyr)
dat %>%
mutate(d = lead(b)) %>%
filter(!is.na(c)) %>%
select(-c)
# A tibble: 2 x 3
# a b d
# <chr> <dbl> <dbl>
#1 x 1 2
#2 z 3 4
Or more compactly
dat %>%
mutate(d = replace(lead(b), is.na(c), NA), c = NULL) %>%
na.omit
Or with fill
library(tidyr)
dat %>%
mutate(c1 = c) %>%
fill(c1) %>%
group_by(c1) %>%
mutate(d = lead(b)) %>%
ungroup %>%
filter(!is.na(c)) %>%
select(-c, -c1)
Or in data.table
library(data.table)
setDT(dat)[, d := shift(b, type = 'lead')][!is.na(c)][, c := NULL][]
# a b d
#1: x 1 2
#2: z 3 4
NOTE: Both the solutions are simple and doesn't require any joins. Besides, it gives the expected output in the OP's post
Or using match from base R
cbind(na.omit(dat), d = with(dat, b[match(c, a, nomatch = 0)]))[, -3]
# a b d
#1 x 1 2
#2 z 3 4

How to create a tibble with hierarchically and arbitrary grouped variables mapped to unique values?

Is there an elegant and generic way to reproduce below tibble ?
data.frame(First=rep(c('A', 'B'), each=2),
Second=rep(c(1, 2), each=4),
Third=rep(c('true', 'false')),
Unique=1:8, stringsAsFactors = F) %>% as.tibble()
output:
<table><tbody><tr><th>First</th><th> Second</th><th> Third</th><th> Unique</th></tr><tr><td><chr> </td><td><int> </td><td><chr> </td><td><dbl></td></tr><tr><td>A</td><td>1 </td><td>true</td><td>1</td></tr><tr><td>A</td><td>1 </td><td>false</td><td>2</td></tr><tr><td>B</td><td>1 </td><td>true</td><td>3</td></tr><tr><td>B</td><td>1 </td><td>false</td><td>4</td></tr><tr><td>A</td><td>2 </td><td>true</td><td>5</td></tr><tr><td>A</td><td>2</td><td>false</td><td>6</td></tr><tr><td>B</td><td>2 </td><td>true</td><td>7</td></tr><tr><td>B</td><td>2</td><td>false</td><td>8</td></tr></tbody></table>
Using tidyverse functions, you can do :
tidyr::crossing(First = c('A', 'B'), Second = 1:2, Third = c(TRUE, FALSE)) %>%
dplyr::mutate(Unique = row_number())
# First Second Third Unique
# <chr> <int> <lgl> <int>
#1 A 1 FALSE 1
#2 A 1 TRUE 2
#3 A 2 FALSE 3
#4 A 2 TRUE 4
#5 B 1 FALSE 5
#6 B 1 TRUE 6
#7 B 2 FALSE 7
#8 B 2 TRUE 8
Hi this should do the work.
expand.grid(first = c('A', 'B'), second = 1:2, third = c("true", "false")) %>%
mutate(Unique = 1:n())

How to group the data by id and get unique values of all columns in R?

I have a table with ID and other columns. I want to group the data by Ids and get the unique values of all columns.
from above table group by ID and get unique(Alt1, Alt2, Alt3)
Resul should be in vector form
A -> 1,2,3,5
B ->1,3,4,5,7
We can get data in long format and for each ID make a list of unique values.
library(dplyr)
library(tidyr)
df1 <- df %>%
pivot_longer(cols = -ID) %>%
group_by(ID) %>%
summarise(value = list(unique(value))) %>%
unnest(value)
df1
# ID value
# <fct> <dbl>
# 1 A 1
# 2 A 3
# 3 A 2
# 4 A 5
# 5 B 1
# 6 B 4
# 7 B 5
# 8 B 3
# 9 B 6
#10 B 7
We can store it as a list if needed using split.
split(df1$value, df1$ID)
#$A
#[1] 1 3 2 5
#$B
#[1] 1 4 5 3 6 7
data.table equivalent of the above would be :
library(Data.table)
setDT(df)
df2 <- melt(df, id.vars = 'ID')[, .(value = list(unique(value))), ID]
unique values are present in df2$value as a vector.
data
df <- data.frame(ID = c('A', 'A', 'B', 'B'),
Alt1 = c(1, 2, 1, 3),
Alt2 = c(3, 5, 4, 6),
Alt3 = c(1, 3, 5, 7))

Resources