Suppose I have a dataframe:
dw <- read.table(header=T, text='
ID q1 q2 q3 q4 q5 ...q10
A 10 6 50 10 bA
B 12 5 70 11 bB
C 20 7 20 8 bC
D 22 8 22 9 bD
')
I would like to move every 2 columns after 'ID' to new rows so it looks like:
ID q1 q2
A 10 6
B 12 5
C 20 7
D 22 8
A 50 10
B 70 11
C 20 8
D 22 9
....
pivot_longer seems to move every single column instead of multiple columns?
It seems that you are not concerned with the column names (other than ID), and that they are all the same class. For this, we can "pivot" manually, without the safeguards or power of pivot_lower perhaps, but without the requirements as well.
The first step is to make sure that class won't be an issue; because you have some strings in there, we need to convert all to character:
dw[-1] <- lapply(dw[-1], as.character)
After that, we can manually extract every two (non-ID) columns and combine with ID:
cols <- seq_along(dw)[-1]
list_of_frames <- lapply(split(cols, cols %/% 2), function(ind) setNames(dw[,c(1, ind)], c("ID", "q1", "q2")))
list_of_frames
# $`1`
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# $`2`
# ID q1 q2
# 1 A 50 10
# 2 B 70 11
# 3 C 20 8
# 4 D 22 9
# $`3`
# ID q1 q2
# 1 A bA zA
# 2 B bB zB
# 3 C bC zC
# 4 D bD zD
This can be easily combined with several methods, choose one of:
data.table::rbindlist(list_of_frames)
dplyr::bind_rows(list_of_frames)
do.call(rbind, list_of_frames)
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# 5 A 50 10
# 6 B 70 11
# 7 C 20 8
# 8 D 22 9
# 9 A bA zA
# 10 B bB zB
# 11 C bC zC
# 12 D bD zD
Data
dw <- structure(list(ID = c("A", "B", "C", "D"), q1 = c("10", "12", "20", "22"), q2 = c("6", "5", "7", "8"), q3 = c("50", "70", "20", "22"), q4 = c("10", "11", "8", "9"), q5 = c("bA", "bB", "bC", "bD"), q6 = c("zA", "zB", "zC", "zD")), row.names = c(NA, -4L), class = "data.frame")
Another option:
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
With data:
dw <- structure(list(ID = c("A", "B", "C", "D"),
q1 = c(10L, 12L, 20L, 22L),
q2 = c(6L, 5L, 7L, 8L),
q3 = c(50L, 70L, 20L, 22L),
q4 = c(10L, 11L, 8L, 9L),
q5 = c("bA", "bB", "bC", "bD"),
q6 = c("cc", "dd", "ee", "ff"))
, class = "data.frame", row.names = c(NA, -4L))
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
#> ID q1 q2
#> 1 A 10 6
#> 2 B 12 5
#> 3 C 20 7
#> 4 D 22 8
#> 5 A 50 10
#> 6 B 70 11
#> 7 C 20 8
#> 8 D 22 9
#> 9 A bA cc
#> 10 B bB dd
#> 11 C bC ee
#> 12 D bD ff
Or more generally:
n <- 3L # operate on every 3 columns
data.frame(
setNames(
c(
list(dw[,1]),
lapply(
2:(n + 1L),
function(i) unlist(dw[,seq(i, ncol(dw), n)], TRUE, FALSE)
)
),
names(dw)[1:(n + 1L)]
)
)
#> ID q1 q2 q3
#> 1 A 10 6 50
#> 2 B 12 5 70
#> 3 C 20 7 20
#> 4 D 22 8 22
#> 5 A 10 bA cc
#> 6 B 11 bB dd
#> 7 C 8 bC ee
#> 8 D 9 bD ff
The melt(...) method for data.table allows for melting groups of columns. Using dw from #r2evans answer:
library(data.table)
setDT(dw)
result <- melt(dw, measure.vars = list(seq(2, ncol(dw), 2), seq(3, ncol(dw), 2)))
result[, variable:=NULL]
result
## ID value1 value2
## 1: A 10 6
## 2: B 12 5
## 3: C 20 7
## 4: D 22 8
## 5: A 50 10
## 6: B 70 11
## 7: C 20 8
## 8: D 22 9
## 9: A bA zA
## 10: B bB zB
## 11: C bC zC
## 12: D bD zD
melt(...) introduces a column variable which keeps track of the location of the original columns in the wide dataset. You don't seem to care about that so it's removed. If there are indeed different classes (integer, character) melt(...) will take care of that with a warning.
Related
I have a large data set:
> ncol(d) [1] 1680 nrow(d) [1] 12
that it looks like this:
a b c e f g
3 2 5 1 3 6
a b c d e g
1 7 8 4 5 8
a c d e f h #in this row b does not exist
5 10 4 7 5 10
And I need that it looks like this:
a b c d e f g h
3 2 5 0 3 6 10 8
1 7 8 4 5 0 8 0
5 0 10 4 7 5 0 10 #and all the other columns ...
Since my data is really long and I have many corrections like this one to do over all the data set, it is hard to do it by hand. I would like to know if there is any way to do this using some sort of automatic way, like a logic function or a loop.
Any idea is welcome
Regards
Here's a possible approach using data.table:
library(data.table)
melt(
setDT(
setnames(
data.table::transpose(df1),
paste(rep(1:(nrow(df1)/2), each = 2), c("name", "value"), sep = "_"))),
measure = patterns("name", "value"))[
, dcast(.SD, variable ~ value1, value.var = "value2", fill = 0)]
# variable a b c d e f g h
# 1: 1 3 2 5 0 1 3 6 0
# 2: 2 1 7 8 4 5 0 8 0
# 3: 3 5 0 10 4 7 5 0 10
We could get the alternate rows with recycling logical vector, construct a data.frame and pivot it to wide format with pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
sub1 <- df1[c(TRUE, FALSE),]
sub2 <- df1[c(FALSE, TRUE),]
tibble(ind = c(row(sub1)), col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))) %>%
pivot_wider(names_from = col1, values_from = col2,
values_fill = list(col2 = 0)) %>%
select(-ind)
#A tibble: 3 x 8
# a b c d e f g h
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
Or using base R with reshape
out <- reshape(
data.frame(ind = c(row(sub1)),
col1 = factor(unlist(sub1), levels = letters[1:8]),
col2 = as.integer(unlist(sub2))),
idvar = 'ind', direction = 'wide', timevar = 'col1')[-1]
names(out) <- sub("col2\\.", "", names(out))
out[is.na(out)] <- 0
row.names(out) <- NULL
out
# a b c d e f g h
#1 3 2 5 0 1 3 6 0
#2 1 7 8 4 5 0 8 0
#3 5 0 10 4 7 5 0 10
data
df1 <- structure(list(v1 = c("a", "3", "a", "1", "a", "5"), v2 = c("b",
"2", "b", "7", "c", "10"), v3 = c("c", "5", "c", "8", "d", "4"
), v4 = c("e", "1", "d", "4", "e", "7"), v5 = c("f", "3", "e",
"5", "f", "5"), v6 = c("g", "6", "g", "8", "h", "10")), class = "data.frame",
row.names = c(NA,
-6L))
I have a list of dataframes and my goal it is transpose them to bind into one. How could i do this? Below it is my list
$pri
$pri$x
a b
1 1 3
2 2 4
$pri$y
a b c
1 1 3 5
2 2 4 6
$sec
$sec$w
a b
1 7 9
2 8 10
$sec$z
a b c d
1 11 13 15 17
2 12 14 16 18
I aim the output like this
"col1" "col2"
a ; 1 ; 2
b ; 3 ; 4
a ; 1 ; 2
b ; 3 ; 4
c ; 5 ; 6
a ; 7 ; 8
b ; 9 ; 10
a ; 11 ; 12
b ; 13 ; 14
c ; 15 ; 16
d ; 17 ; 18
library(purrr)
pri <-
list(
x = data.frame(a = 1:2, b = 3:4),
y = data.frame(a = 1:2, b = 3:4, c = 5:6)
)
sec <-
list(
w = data.frame(a = 7:8, b = 9:10),
z = data.frame(a = 11:12, b = 13:14, c = 15:16, d = 17:18)
)
list(pri = pri, sec = sec) %>% flatten() %>% map(t) %>% reduce(rbind)
#> [,1] [,2]
#> a 1 2
#> b 3 4
#> a 1 2
#> b 3 4
#> c 5 6
#> a 7 8
#> b 9 10
#> a 11 12
#> b 13 14
#> c 15 16
#> d 17 18
Created on 2020-03-12 by the reprex package (v0.3.0)
Assuming your data given like this (According to your question):
frame_list <- list(pri = list(x = structure(list(a = 1:2, b = 3:4), class = "data.frame", row.names = c(NA,
-2L)), y = structure(list(a = 1:2, b = 3:4, c = 5:6), class = "data.frame", row.names = c(NA,
-2L))), sec = list(w = structure(list(a = 7:8, b = 9:10), class = "data.frame", row.names = c(NA,
-2L)), z = structure(list(a = 11:12, b = 13:14, c = 15:16, d = 17:18), class = "data.frame", row.names = c(NA,
-2L))))
then you can do:
df <- t(do.call('cbind', unlist(frame_list, recursive = FALSE)))
rownames(df) <- gsub('\\w+\\.\\w\\.', '', rownames(df))
Note: The output will be a matrix, In case you need to convert this to a dataframe, you can use data.frame but this will change the rownames by appending some number to it to make it unique.
Output:
[,1] [,2]
a 1 2
b 3 4
a 1 2
b 3 4
c 5 6
a 7 8
b 9 10
a 11 12
b 13 14
c 15 16
d 17 18
In case you want it into a dataframe, then you can do:
df <- data.frame(t(do.call('cbind', unlist(frame_list, recursive = FALSE))), stringsAsFactors = FALSE)
df$newcol <- gsub('\\w+\\.\\w\\.', '', rownames(df))
rownames(df) <- NULL
Output:
X1 X2 newcol
1 1 2 a
2 3 4 b
3 1 2 a
4 3 4 b
5 5 6 c
6 7 8 a
7 9 10 b
8 11 12 a
9 13 14 b
10 15 16 c
11 17 18 d
you could also solve your problem using base R functions as follows:
dfs <- list(pri = list(x = structure(list(a = 1:2, b = 3:4), class = "data.frame", row.names = c(NA,
-2L)), y = structure(list(a = 1:2, b = 3:4, c = 5:6), class = "data.frame", row.names = c(NA,
-2L))), sec = list(w = structure(list(a = 7:8, b = 9:10), class = "data.frame", row.names = c(NA,
-2L)), z = structure(list(a = 11:12, b = 13:14, c = 15:16, d = 17:18), class = "data.frame", row.names = c(NA,
-2L))))
t(Reduce(cbind, unlist(dfs, FALSE)))
# [,1] [,2]
# a 1 2
# b 3 4
# a 1 2
# b 3 4
# c 5 6
# a 7 8
# b 9 10
# a 11 12
# b 13 14
# c 15 16
# d 17 18
I have this data frame:
df <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
stringsAsFactors = FALSE)
For each group (id), I aim to remove the rows with one or multiple leading NA in front of an "a" (in the column "status") but not in front of a "b".
The final data frame should look like this:
structure(list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b")),
.Names = c("id", "status"), row.names = c(NA, -13L), class = "data.frame")
How do I do that?
Edit: alternatively, how would I do it to preserve other variables in the data frame such as the variable otherVar in the following example:
df2 <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
otherVar = letters[1:16],
stringsAsFactors = FALSE)
We can group by 'id', summarise the 'status' by pasteing the elements together, then use gsub to remove the NA before the 'a' and convert it to 'long' format with separate_rows
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
summarise(status = gsub("(NA, ){1,}(?=a)", "", toString(status),
perl = TRUE)) %>%
separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
out1 <- setDT(df)[, strsplit(gsub("(NA, ){1,}(?=a)", "",
toString(status), perl = TRUE), ", "), id]
setnames(out1, 'V1', "status")[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
i1 <- setDT(df2)[, .I[seq(which(c(diff((status %in% "a") +
rleid(is.na(status))) > 1), FALSE))] , id]$V1
df2[-i1]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
From zoo with na.locf and is.na, notice it assuming you data is ordered.
df[!(na.locf(df$status,fromLast = T)=='a'&is.na(df$status)),]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not as pretty base translation :
dplyr
library(dplyr)
df %>% group_by(id) %>%
filter(status[!is.na(status)][1]!="a" | !is.na(status))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
do.call(rbind,
lapply(split(df,df$id),
function(x) x[x$status[!is.na(x$status)][1]!="a" | !is.na(x$status),]))
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
Will fail if not all NAs are leading because will remove all NAs from groups starting with "a" as a first non NA value.
I have a dataframe as follows:
df1
ColA ColB ColC ColD
10 A B L
11 N Q NA
12 P J L
43 M T NA
89 O J T
df2
ATTR Att R1 R2 R3 R4
1 45 A B NA NA
2 40 C D NA NA
3 33 T J O NA
4 65 L NA NA NA
5 20 P L J NA
6 23 Q NA NA NA
7 38 Q L NA NA
How do I match up df2 with df1 so that if ALL the values in each df2 row (disregarding the order) show up in the df1 rows, then it will populate. So it is checking if ALL not just one value from each df2 row matches up with each df1 row. The final result in this case should be this:
ColA ColB ColC ColD ATTR Att R1 R2 R3 R4
10 A B L 1 45 A B NA NA
10 A B L 4 65 L NA NA NA
11 N Q NA 6 23 Q NA NA NA
12 P J L 4 65 L NA NA NA
12 P J L 5 20 P L J NA
89 O J T 3 33 T J O NA
Thanks
Here is a possible solution using base R.
Make sure everything is a character before continuing, i.e.
df[-1] <- lapply(df[-1], as.character)
df1[-c(1:2)] <- lapply(df1[-c(1:2)], as.character)
First we create two lists which contain vectors of the rowwise elements of each data frame. We then create a matrix with the length of elements from l2 are found in l1, If the length is 0 then it means they match. i.e,
l1 <- lapply(split(df[-1], seq(nrow(df))), function(i) i[!is.na(i)])
l2 <- lapply(split(df1[-c(1:2)], seq(nrow(df1))), function(i) i[!is.na(i)])
m1 <- sapply(l1, function(i) sapply(l2, function(j) length(setdiff(j, i))))
m1
# 1 2 3 4 5
#1 0 2 2 2 2
#2 2 2 2 2 2
#3 3 3 2 2 0
#4 0 1 0 1 1
#5 2 3 0 3 2
#6 1 0 1 1 1
#7 1 1 1 2 2
We then use that matrix to create a couple of coloumns in our original df. The first column rpt will indicate how many times each row has length 0 and use that as a number of repeats for each row. We also use it to filter out all the 0 lengths (i.e. the rows that do not have a match with df1). After expanding the data frame we create another variable; ATTR (same name as ATTR in df1) in order to use it for a merge. i.e.
df$rpt <- colSums(m1 == 0)
df <- df[df$rpt != 0,]
df <- df[rep(row.names(df), df$rpt),]
df$ATTR <- which(m1 == 0, arr.ind = TRUE)[,1]
df
# ColA ColB ColC ColD rpt ATTR
#1 10 A B L 2 1
#1.1 10 A B L 2 4
#2 11 N Q <NA> 1 6
#3 12 P J L 2 4
#3.1 12 P J L 2 5
#5 89 O J T 1 3
We then merge and order the two data frames,
final_df <- merge(df, df1, by = 'ATTR')
final_df[order(final_df$ColA),]
# ATTR ColA ColB ColC ColD rpt Att R1 R2 R3 R4
#1 1 10 A B L 2 45 A B <NA> <NA>
#3 4 10 A B L 2 65 L <NA> <NA> <NA>
#6 6 11 N Q <NA> 1 23 Q <NA> <NA> <NA>
#4 4 12 P J L 2 65 L <NA> <NA> <NA>
#5 5 12 P J L 2 20 P L J <NA>
#2 3 89 O J T 1 33 T J O <NA>
DATA
dput(df)
structure(list(ColA = c(10L, 11L, 12L, 43L, 89L), ColB = c("A",
"N", "P", "M", "O"), ColC = c("B", "Q", "J", "T", "J"), ColD = c("L",
NA, "L", NA, "T")), .Names = c("ColA", "ColB", "ColC", "ColD"
), row.names = c(NA, -5L), class = "data.frame")
dput(df1)
structure(list(ATTR = 1:7, Att = c(45L, 40L, 33L, 65L, 20L, 23L,
38L), R1 = c("A", "C", "T", "L", "P", "Q", "Q"), R2 = c("B",
"D", "J", NA, "L", NA, "L"), R3 = c(NA, NA, "O", NA, "J", NA,
NA), R4 = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("ATTR",
"Att", "R1", "R2", "R3", "R4"), row.names = c(NA, -7L), class = "data.frame")
I would like to splite each row of a data frame(numberic) into two rows. For example, part of the original data frame like this (nrow(original datafram) > 2800000):
ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47
And after spliting each row, we can get:
ID X Y Z
1 3 2 6
22 54 NA NA
6 11 5 9
52 71 NA NA
3 7 2 5
2 34 NA NA
5 10 7 1
23 47 NA NA
the "value_1" and "value_2" columns are split and each element is set to a new row. For example, value_1 = 22 and value_2 = 54 are set to a new row.
Here is one option with data.table. We convert the 'data.frame' to 'data.table' by creating a column of rownames (setDT(df1, keep.rownames = TRUE)). Subset the columns 1:5 and 1, 6, 7 in a list, rbind the list element with fill = TRUE option to return NA for corresponding columns that are not found in one of the datasets, order by the row number ('rn') and assign (:=) the row number column to 'NULL'.
library(data.table)
setDT(df1, keep.rownames = TRUE)[]
rbindlist(list(df1[, 1:5, with = FALSE], setnames(df1[, c(1, 6:7),
with = FALSE], 2:3, c("ID", "X"))), fill = TRUE)[order(rn)][, rn:= NULL][]
# ID X Y Z
#1: 1 3 2 6
#2: 22 54 NA NA
#3: 6 11 5 9
#4: 52 71 NA NA
#5: 3 7 2 5
#6: 2 34 NA NA
#7: 5 10 7 1
#8: 23 47 NA NA
A hadleyverse corresponding to the above logic would be
library(dplyr)
tibble::rownames_to_column(df1[1:4]) %>%
bind_rows(., setNames(tibble::rownames_to_column(df1[5:6]),
c("rowname", "ID", "X"))) %>%
arrange(rowname) %>%
select(-rowname)
# ID X Y Z
#1 1 3 2 6
#2 22 54 NA NA
#3 6 11 5 9
#4 52 71 NA NA
#5 3 7 2 5
#6 2 34 NA NA
#7 5 10 7 1
#8 23 47 NA NA
data
df1 <- structure(list(ID = c(1L, 6L, 3L, 5L), X = c(3L, 11L, 7L, 10L
), Y = c(2L, 5L, 2L, 7L), Z = c(6L, 9L, 5L, 1L), value_1 = c(22L,
52L, 2L, 23L), value_2 = c(54L, 71L, 34L, 47L)), .Names = c("ID",
"X", "Y", "Z", "value_1", "value_2"), class = "data.frame",
row.names = c(NA, -4L))
Here's a (very slow) pure R solution using no extra packages:
# Replicate your matrix
input_df <- data.frame(ID = rnorm(10000),
X = rnorm(10000),
Y = rnorm(10000),
Z = rnorm(10000),
value_1 = rnorm(10000),
value_2 = rnorm(10000))
# Preallocate memory to a data frame
output_df <- data.frame(
matrix(
nrow = nrow(input_df)*2,
ncol = ncol(input_df)-2))
# Loop through each row in turn.
# Put the first four elements into the current
# row, and the next two into the current+1 row
# with two NAs attached.
for(i in seq(1, nrow(output_df), 2)){
output_df[i,] <- input_df[i, c(1:4)]
output_df[i+1,] <- c(input_df[i, c(5:6)],NA,NA)
}
colnames(output_df) <- c("ID", "X", "Y", "Z")
Which results in
> head(output_df)
X1 X2 X3 X4
1 0.5529417 -0.93859275 2.0900276 -2.4023800
2 0.9751090 0.13357075 NA NA
3 0.6753835 0.07018647 0.8529300 -0.9844643
4 1.6405939 0.96133195 NA NA
5 0.3378821 -0.44612782 -0.8176745 0.2759752
6 -0.8910678 -0.37928353 NA NA
This should work
data <- read.table(text= "ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47", header=T)
data1 <- data[,1:4]
data2 <- setdiff(data,data1)
names(data2) <- names(data1)[1:ncol(data2)]
combined <- plyr::rbind.fill(data1,data2)
n <- nrow(data1)
combined[kronecker(1:n, c(0, n), "+"),]
Though why you would need to do this beats me.