Move some observation to different rows - r

e.g.
I have this data frame:
a 1 2 3 4 5 6
b 7 8 9 1 2 3
c 4 5 6 7 8 9
and I want to transform it to:
a 1 2 3
a 4 5 6
b 7 8 9
b 1 2 3
c 4 5 6
c 7 8 9
Basically, for every single row, I want the e.g. 2nd to 6th, 7th to 11th, 11th to 15th... variable values to move a row below where the first variable value is the letter of the original row.
What should I do? I tried to work with gather() but that is not the case here.
I am just a beginner in R and would appreciate any help. thanks

This groups by 3 columns; just change the %/% 3 to %/% 5 for different numbers of columns. (This assumes that there are an equal number of columns in each grouping.)
out <- do.call(rbind, lapply(split.default(dat[,-1], (seq_along(dat[,-1])-1) %/% 3),
function(a) cbind(dat[,1,drop=FALSE], unname(a))))
out
# V1 1 2 3
# 0.1 a 1 2 3
# 0.2 b 7 8 9
# 0.3 c 4 5 6
# 1.1 a 4 5 6
# 1.2 b 1 2 3
# 1.3 c 7 8 9
And we can clean it up a little with
row.names(out) <- NULL
out[order(out[,1]),]
# V1 1 2 3
# 1 a 1 2 3
# 4 a 4 5 6
# 2 b 7 8 9
# 5 b 1 2 3
# 3 c 4 5 6
# 6 c 7 8 9

With tidyverse
Hard Code version
Isolate the first 4 columns (three values and the key) using select
rbind and select the last 3 columns and the key
EDIT Be sure to rename the columns that are to be combined with rbind to the same name as the select df from step 1 above.
arrange by the key
df %>%
select(1:4) %>%
rbind(df %>% select(1, "v1" = 5,
"v2" = 6,
"v3" = 7)) %>%
arrange(key)
# key v1 v2 v3
# 1 a 1 2 3
# 2 a 4 5 6
# 3 b 7 8 9
# 4 b 1 2 3
# 5 c 4 5 6
# 6 c 7 8 9
EDIT Generalized (a bit hairier)
sample df
set.seed(42)
df_2 <- tibble(
bug = letters,
col1 = sample(1:26),
col2 = sample(1:26),
col3 = sample(1:26),
col4 = sample(1:26),
col5 = sample(1:26),
col6 = sample(1:26),
col7 = sample(1:26),
col8 = sample(1:26),
col9 = sample(1:26),
col10 = sample(1:26),
col11 = sample(1:26),
col12 = sample(1:26),
col13 = sample(1:26),
col14 = sample(1:26),
col15 = sample(1:26)
)
A function that satisfies a generalized approach
create_rowgroups.f <- function(df, key, groupsize){
if( !(key %in% colnames(df)) ){
print(paste(key, "is expected to be a column in df"))
stop()
}
if( (ncol(df)-1) %% groupsize != 0 ){
print("Function requires groups to fit all variable columns with the exception of the key")
stop()
}
fnames = colnames(df[ , 1: (groupsize + 1) ])
df_2 <- bind_rows(lapply(
1:(ncol(df)/groupsize),
function(df, groupsize, key, fnames, index){
pos1 = (index * groupsize) - groupsize + 2
pos2 = (index * groupsize) + 1
tempdf <- df %>% select(!!key, !!pos1:!!pos2)
colnames(tempdf) = fnames
return(tempdf)
},
df = df,
key = key,
fnames = fnames,
groupsize = groupsize
))
df_2 <- df_2 %>% arrange(across(.cols = key))
return(df_2)
}
This is a lot of code that wraps around a simple line of lapply to assist the OP in using the UDF, create_rowgroups.f

If the data showed is the one showed, this can be done by easy subsetting of columns, and rbind the datasets after setting the column names same
library(dplyr)
rbind(df1[1:4], setNames(df1[c(1, 5:7)], names(df1)[1:4])) %>%
arrange(1)
If there are many columns, an automatic method that is easier to understand would be seq. Loop over the sequence of index, subset the columns, rbind within in do.call as we are creating a list by looping
out <- cbind(df1[1], do.call(rbind, lapply(seq(2, ncol(df1), by = 3),
function(i) setNames(df1[i:(i+2)], paste0("v", 2:4)))))
out[order(out$v1),]
data
df1 <- structure(list(v1 = c("a", "b", "c"), v2 = c(1L, 7L, 4L), v3 = c(2L,
8L, 5L), v4 = c(3L, 9L, 6L), v5 = c(4L, 1L, 7L), v6 = c(5L, 2L,
8L), v7 = c(6L, 3L, 9L)), class = "data.frame", row.names = c(NA,
-3L))

Here is another base R option
cbind(
df[rep(1:nrow(df), each = 2), ][1],
do.call(
"+",
lapply(0:1,
FUN = function(k) {
kronecker(
as.matrix(df[-1][(3*k-1)+2:4]),
(matrix(c(1, 0), nrow = 2) + k) %% 2
)
}
)
)
)
such that
V1 1 2 3
1 a 1 2 3
1.1 a 4 5 6
2 b 7 8 9
2.1 b 1 2 3
3 c 4 5 6
3.1 c 7 8 9

Related

Creating a new variable based on conditions of 3 other variables in R

I have a data set (n=500) in R that looks like this
ID A C S
1 4 4 4
2 3 2 3
3 5 4 2
Id like to create a new variable(I am calling this variable "same") that tells me whether any of my columns have the same value (excluding my ID column). So,
ID A C S Same
1 4 4 4 all
2 3 2 3 as
3 5 4 2 none
4 7 7 2 ac
Any help would be much appreciated! I am pretty lost! Thank you!
We may loop over the rows with apply (MARGIN = 1) with selected columns ([-1] without the 'ID' column), then check the length of unique elements, if it is 1, return 'all' or else paste the names of the duplicated elements. If there are no duplicates, then it returns blank "", change the blank to 'none'
df1$Same <- apply(df1[-1], 1, \(x) {
x1 <- if(length(unique(x)) == 1) 'all' else
paste(tolower(names(x))[duplicated(x)|duplicated(x,
fromLast = TRUE)], collapse = "")
x1[x1 == ""] <- "none"
x1})
-output
> df1
ID A C S Same
1 1 4 4 4 all
2 2 3 2 3 as
3 3 5 4 2 none
4 4 7 7 2 ac
data
df1 <- structure(list(ID = 1:4, A = c(4L, 3L, 5L, 7L), C = c(4L, 2L,
4L, 7L), S = c(4L, 3L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-4L))
Try this using dplyr rowwise with rle
df |> rowwise() |> mutate(Same = case_when(length(rle(sort(c_across(A:S)))$values) == 1 ~ "all" ,
length(rle(sort(c_across(A:S)))$values) == 3 ~ "none" ,
c_across(A) == c_across(C) ~ "ac" ,
c_across(C) == c_across(S) ~ "cs" , TRUE ~ "as"))
output
# A tibble: 4 × 5
# Rowwise:
ID A C S Same
<int> <int> <int> <int> <chr>
1 1 4 4 4 all
2 2 3 2 3 as
3 3 5 4 2 none
4 4 7 7 2 ac

Nested full_join with suffixes for more than 2 data.frames

I want to merge several data.frames with some common columns and append a suffix to the column names to keep track from where does the data for each column come from.
I can do it easily with the suffix term in the first full_join, but when I do the second join, no suffixes are added. I can rename the third data.frame so it has suffixes, but I wanted to know if there is another way of doing it using the suffix term.
Here is an example code:
x = data.frame(col1 = c("a","b","c"), col2 = 1:3, col3 = 1:3)
y = data.frame(col1 = c("b","c","d"), col2 = 4:6, col3 = 1:3)
z = data.frame(col1 = c("c","d","a"), col2 = 7:9, col3 = 1:3)
> df = full_join(x, y, by = "col1", suffix = c("_x","_y")) %>%
full_join(z, by = "col1", suffix = c("","_z"))
> df
col1 col2_x col3_x col2_y col3_y col2 col3
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
I was expecting that col2 and col3 from data.frame z would have a "_z" suffix. I have tried using empty suffixes while merging two data.frames and it works.
I can work around by renaming the columns in z before doing the second full_join, but in my real data I have several common columns, and if I wanted to merge more data.frames it would complicate the code. This is my expected output.
> colnames(z) = paste0(colnames(z),"_z")
> df = full_join(x, y, by = "col1", suffix = c("_x","_y")) %>%
full_join(z, by = c("col1"="col1_z"))
> df
col1 col2_x col3_x col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
I have seen other similar problems in which adding an extra column to keep track of the source data.frame is used, but I was wondering why does not the suffix term work with multiple joins.
PS: If I keep the first suffix empty, I can add suffixes in the second join, but that will leave the col2 and col3 form x without suffix.
> df = full_join(x, y, by = "col1", suffix = c("","_y")) %>%
full_join(z, by = "col1", suffix = c("","_z"))
> df
col1 col2 col3 col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
You can do it like this:
full_join(x, y, by = "col1", suffix = c("","_y")) %>%
full_join(z, by = "col1", suffix = c("_x","_z"))
col1 col2_x col3_x col2_y col3_y col2_z col3_z
1 a 1 1 NA NA 9 3
2 b 2 2 4 1 NA NA
3 c 3 3 5 2 7 1
4 d NA NA 6 3 8 2
Adding the suffix for xat the last join should do the trick.

How to include exception when using fill everything?

I'm merging two data frames as follows:
data_merged <- full_join(df1, df2, by=c("col1","col2")) %>%
fill(everything(), .direction = 'down')
However, there is a column in the new merged data frame that I don't want to fill (say, col3). This row needs to retain its NA value. I've tried doing this with select but failed, and also thought of maybe working around with making part of it a tibble but can't capitalize on the idea.
Does anybody have any ideas?
Try this:
data.frame(col1 = 1:10, col2 = c(1, NA), col3 = c(2,NA))%>%
fill(!col3, .direction = 'down')
# col1 col2 col3
# 1 1 1 2
# 2 2 1 NA
# 3 3 1 2
# 4 4 1 NA
# 5 5 1 2
# 6 6 1 NA
# 7 7 1 2
# 8 8 1 NA
# 9 9 1 2
# 10 10 1 NA
We can also use na.locf from zoo
library(zoo)
df1$col3 <- na.locf0(df1$col3)
data
df1 <- data.frame(col1 = 1:10, col2 = c(1, NA), col3 = c(2,NA))

Converting single row to multiple rows, ignoring NAs

I have the following data-set
ID COL1 COL2 COL3
1 22 12 NA
2 2 NA NA
3 1 2 4
4 NA NA NA
The above data needs to be converted into the following format
ID VALUE
1 22
1 12
2 2
3 1
3 2
3 4
Please note that NAs are present in the source data frame which should be ignored in the final table.
For speed with the larger datasets, use the data.table melt method:
library("data.table")
setDT(df)
melt(df, id.vars = "ID", na.rm = TRUE)
# ID variable value
# 1: 1 COL1 22
# 2: 2 COL1 2
# 3: 3 COL1 1
# 4: 1 COL2 12
# 5: 3 COL2 2
# 6: 3 COL3 4
library(dplyr)
library(tidyr)
gather(df, column, value, COL1:COL3, na.rm=TRUE) %>%
select(-column)
In base R, you could use lapply to go through columns and extract non NA elements and corresponding ID.
do.call(rbind, lapply(df[,-1], function(x)
data.frame(ID = df$ID[!is.na(x)], VALUE = x[!is.na(x)])))
# ID VALUE
#COL1.1 1 22
#COL1.2 2 2
#COL1.3 3 1
#COL2.1 1 12
#COL2.2 3 2
#COL3 3 4
If necessary, the order can be changed in one additional step
df2 = do.call(rbind, lapply(df[,-1], function(x)
data.frame(ID = df$ID[!is.na(x)], VALUE = x[!is.na(x)])))
do.call(rbind, split(df2, df2$ID))
# ID VALUE
#1.COL1.1 1 22
#1.COL2.1 1 12
#2 2 2
#3.COL1.3 3 1
#3.COL2.2 3 2
#3.COL3 3 4
DATA
df = structure(list(ID = 1:4, COL1 = c(22L, 2L, 1L, NA), COL2 = c(12L,
NA, 2L, NA), COL3 = c(NA, NA, 4L, NA)), .Names = c("ID", "COL1",
"COL2", "COL3"), class = "data.frame", row.names = c(NA, -4L))
Here is a base R option
d1 <- na.omit(data.frame(ID = rep(df1$ID, each = ncol(df1)-1), VALUE = c(t(df1[-1]))))
d1
# ID VALUE
#1 1 22
#2 1 12
#4 2 2
#7 3 1
#8 3 2
#9 3 4
Or we can use a compact option with data.table
library(data.table)
setDT(df1)[, unlist(.SD), .(ID)][!is.na(V1)]

find only the group members in r

I'm stuck with defining group members to an individual. I was working in excel but that is failing since the number of individuals in a group varies between groups. I used this formula
=IFERROR(INDEX($A$1:$A$10727;SMALL(IF($S$1:$S$10727=$S2;ROW($S$1:$S$10727);"");Nth);1);"NA")
This returns the Nth individual in a group. This is not working since gives me all the individuals and I only want the group member, so not the individuals itself. So I was thinking to go to R, but I don't know where to start.
My data looks like this:
group ID
1 1
1 2
1 3
2 4
2 5
3 6
3 7
3 8
3 9
3 10
I would like this:
group ID gm1 gm2 gm3 gm4
1 1 2 3 NA NA
1 2 1 3 NA NA
1 3 1 2 NA NA
2 4 5 NA NA NA
2 5 4 NA NA NA
3 6 7 8 9 10
3 7 6 8 9 10
3 8 6 7 9 10
3 9 6 7 8 10
3 10 6 7 8 9
Is there a formula in R that gives me the group members?
We can do this with combn and cSplit
library(splitstackshape)
df1$gm <- unlist(unsplit(lapply(split(df1$ID, df1$group), function(x)
lapply(x, function(y) {
i1 <- x[y!= x]
if(length(i1) >1) combn(i1, length(i1), FUN = paste, collapse=", ") else i1
})), df1$group))
cSplit(df1, 'gm', ', ')
# group ID gm_1 gm_2 gm_3 gm_4
# 1: 1 1 2 3 NA NA
# 2: 1 2 1 3 NA NA
# 3: 1 3 1 2 NA NA
# 4: 2 4 5 NA NA NA
# 5: 2 5 4 NA NA NA
# 6: 3 6 7 8 9 10
# 7: 3 7 6 8 9 10
# 8: 3 8 6 7 9 10
# 9: 3 9 6 7 8 10
#10: 3 10 6 7 8 9
Or the same can be implemented with data.table and cSplit
library(data.table)
cSplit(setDT(df1)[, gm := unlist(lapply(seq_len(.N), function(i) {
i1 <- ID[i != seq_len(.N)]
if(length(i1) > 1) combn(i1, length(i1), FUN =paste, collapse=", ")
else as.character(i1)})), group], 'gm', ', ')
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L,
3L), ID = 1:10), .Names = c("group", "ID"), class = "data.frame", row.names = c(NA,
-10L))
Using dplyr and tidyr you could solve this in the following way. First we define a function that solves the problem for a single group, then we simply apply this function to all the groups using do.
library(dplyr)
df <- data.frame(group = rep(1:3, c(3, 2, 5)), ID = 1:10)
add_group_members <- function(df) {
df_copy <- df
colnames(df_copy)[2] <- "gm_id"
inner_join(df, df_copy, by = c("group" = "group")) %>%
filter(ID != gm_id) %>%
group_by(ID) %>%
mutate(gm = paste("gm", row_number(), sep = '')) %>%
tidyr::spread(key = gm, value = gm_id) %>% ungroup
}
df %>% group_by(group) %>% do(add_group_members(.)) %>% ungroup
Another tidyverse solution:
df <- data.frame(x = rep(1:3, c(3, 2, 5)), id = 1:10)
library(tidyverse)
df2 <-
df %>%
group_by(x) %>%
mutate(unique = paste(unique(id), collapse = ","))
df2$group_unique <- map_chr(seq_len(nrow(df2)), function(index) {
row_unique <- as.numeric(strsplit(df2[[index, "unique"]], ",")[[1]])
paste0(setdiff(row_unique, df2[[index, "id"]]), collapse = ",")
})
df2 %>%
select(-unique) %>%
separate(group_unique, paste("gm_", 1:(max(table(df$x)) - 1)))

Resources