How to stack multiple columns using tidyverse in R

I have a data frame like this in wide format
set.seed(1)
df <- data.frame(item = letters[1:6],
                 field1a = sample(6, 6), field1b = sample(60, 6), field1c = sample(200, 6),
                 field2a = sample(6, 6), field2b = sample(60, 6), field2c = sample(200, 6))
What would be the best way to stack all the a columns together, all the b columns together, and all the c columns together, like this?
items fielda fieldb fieldc
a 2 52 121
a 1 44 57

Using base R:
cbind(item=df$item,unstack(transform(stack(df,-1),ind=sub("\\d+","",ind))))
item fielda fieldb fieldc
1 a 2 57 138
2 b 6 39 77
3 c 3 37 153
4 d 4 4 99
5 e 1 12 141
6 f 5 10 194
7 a 3 17 97
8 b 4 23 120
9 c 5 1 98
10 d 1 22 37
11 e 2 49 163
12 f 6 19 131
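For readers who prefer it spelled out, the same base R pipeline can be written step by step (this is just a sketch of the one-liner above, not a different method):
long <- stack(df, select = -1)            # stack all field* columns into (values, ind)
long$ind <- sub("\\d+", "", long$ind)     # drop the digit: "field1a" -> "fielda"
cbind(item = df$item, unstack(long))      # back to one column per field letter; item recycles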
Or you can use the reshape function in Base R:
reshape(df,varying = split(names(df)[-1],rep(1:3,2)),idvar = "item",direction = "long")
item time field1a field1b field1c
a.1 a 1 2 57 138
b.1 b 1 6 39 77
c.1 c 1 3 37 153
d.1 d 1 4 4 99
e.1 e 1 1 12 141
f.1 f 1 5 10 194
a.2 a 2 3 17 97
b.2 b 2 4 23 120
c.2 c 2 5 1 98
d.2 d 2 1 22 37
e.2 e 2 2 49 163
f.2 f 2 6 19 131
You can also rename the columns yourself so that reshape can split them on the separator, then reshape:
names(df)=sub("(\\d)(.)","\\2.\\1",names(df))
reshape(df,varying= -1,idvar = "item",direction = "long")

If we are using the tidyverse, gather into 'long' format, rearrange the column names, and then spread:
library(tidyverse)
out <- df %>%
  gather(key, val, -item) %>%
  mutate(key1 = gsub("\\d+", "", key),
         key2 = gsub("\\D+", "", key)) %>%
  select(-key) %>%
  spread(key1, val) %>%
  select(-key2)
head(out, 2)
# item fielda fieldb fieldc
#1 a 2 57 138
#2 a 3 17 97
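Note that gather/spread are superseded in current tidyr; here is a sketch of the same reshape with pivot_longer (assumes tidyr >= 1.0 and dplyr >= 1.0; the rename_with step only restores the "field" prefix and is optional):
out2 <- df %>%
  pivot_longer(-item, names_to = c("set", ".value"),
               names_pattern = "field(\\d+)(.)") %>%
  select(-set) %>%
  rename_with(~ paste0("field", .x), -item)
head(out2, 2)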
Or a similar option is melt/dcast from data.table: melt into 'long' format, strip the digits from 'variable', and then dcast back to 'wide' format.
library(data.table)
dcast(melt(setDT(df), id.var = "item")[, variable := sub("\\d+", "", variable)],
      item + rowid(variable) ~ variable, value.var = "value")[, variable := NULL][]
# item fielda fieldb fieldc
# 1: a 2 57 138
# 2: a 3 17 97
# 3: b 6 39 77
# 4: b 4 23 120
# 5: c 3 37 153
# 6: c 5 1 98
# 7: d 4 4 99
# 8: d 1 22 37
# 9: e 1 12 141
#10: e 2 49 163
#11: f 5 10 194
#12: f 6 19 131
NOTE: This should also work when the lengths are not balanced for each case.
data
set.seed(1)
df = data.frame(item = letters[1:6],
field1a=sample(6,6),
field1b=sample(60,6),
field1c=sample(200,6),
field2a=sample(6,6),
field2b=sample(60,6),
field2c=sample(200,6))

Related

How do I regroup data?

I am looking to change the structure of my dataframe, but I am not really sure how to do it. I am not even sure how to word the question either.
ID <- c(1,8,6,2,4)
a <- c(111,94,85,76,72)
b <- c(75,37,86,55,62)
dataframe <- data.frame(ID,a,b)
ID a b
1 1 111 75
2 8 94 37
3 6 85 86
4 2 76 55
5 4 72 62
Above is the code with its output. I want the output to look like the following instead. The only way I know how to do this is to type it manually, which would take forever for my large data set. Is there another way to do it?
ID letter value
1 1 a 111
2 1 b 75
3 8 a 94
4 8 b 37
5 6 a 85
6 6 b 86
7 2 a 76
8 2 b 55
9 4 a 72
10 4 b 62
We may use pivot_longer
library(dplyr)
library(tidyr)
dataframe %>%
pivot_longer(cols = a:b, names_to = 'letter')
-output
# A tibble: 10 × 3
ID letter value
<dbl> <chr> <dbl>
1 1 a 111
2 1 b 75
3 8 a 94
4 8 b 37
5 6 a 85
6 6 b 86
7 2 a 76
8 2 b 55
9 4 a 72
10 4 b 62
A base R option using reshape:
df <- reshape(dataframe, direction = "long",
v.names = "value",
varying = 2:3,
times = names(dataframe)[2:3],
timevar = "letter",
idvar = "ID")
df <- df[ order(match(df$ID, dataframe$ID)), ]
row.names(df) <- NULL
Output
ID letter value
1 1 a 111
2 1 b 75
3 8 a 94
4 8 b 37
5 6 a 85
6 6 b 86
7 2 a 76
8 2 b 55
9 4 a 72
10 4 b 62
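For completeness, a data.table sketch of the same reshape; melt stacks column by column, so the last line reorders the rows to match the interleaved layout shown above:
library(data.table)
out <- melt(as.data.table(dataframe), id.vars = "ID",
            variable.name = "letter", value.name = "value")
out[order(match(ID, dataframe$ID))]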

Reshaping data frame in R: from wide to long, but the 'varying' columns have unequal length

My question is described in the code below. I have looked here and in other forums for similar problems, but haven't found a solution that quite matches what I'm asking here. If it can be solved relying only on basic R, that would be preferable, but using a package is fine too.
id1 <- c("A", "A", "A", "B", "B", "C", "C", "C")
id2 <- c(10, 20, 30, 10, 30, 10, 20, 30)
x.1 <- ceiling(runif(8)*80) + 20
y.1 <- ceiling(runif(8)*15) + 200
x.2 <- ceiling(runif(8)*90) + 20
y.2 <- ceiling(runif(8)*20) + 200
x.3 <- ceiling(runif(8)*80) + 40
# The data frame contains two kinds of data values, x and y, repeated with a suffix number. In my example
# neither the id part nor the data part is structured in a completely uniform manner.
mywidedata <- data.frame(id1, id2, x.1, y.1, x.2, y.2, x.3)
# If I wanted to make the data frame even wider, this would work. It generates NAs for the missing combination (B,20).
reshape(mywidedata, idvar = "id1", timevar = "id2", direction = "wide")
# What I want is "long", and this fails.
reshape(mywidedata, varying = c(3:7), direction = "long")
# I could introduce the needed column. This works.
mywidecopy <- mywidedata
mywidecopy$y.3 <- NA
mylongdata <- reshape(mywidecopy, idvar=c(1,2), varying = c(3:8), direction = "long", sep = ".")
# (sep-argument not needed in this case - the function can figure out the system)
names(mylongdata)[(names(mylongdata)=="time")] <- "id3"
# I want to reach the same outcome without manual manipulation. Is it possible with just the
# built-in 'reshape'?
# Trying 'melt'. Not what I want.
reshape::melt(mywidedata, id.vars = c(1,2))
You can use pivot_longer from tidyr:
tidyr::pivot_longer(mywidedata,
cols = -c(id1, id2),
names_to = c('.value', 'id3'),
names_sep = '\\.')
# A tibble: 24 x 5
# id1 id2 id3 x y
# <chr> <dbl> <chr> <dbl> <dbl>
# 1 A 10 1 66 208
# 2 A 10 2 95 220
# 3 A 10 3 89 NA
# 4 A 20 1 34 208
# 5 A 20 2 81 219
# 6 A 20 3 82 NA
# 7 A 30 1 23 201
# 8 A 30 2 80 204
# 9 A 30 3 75 NA
#10 B 10 1 52 210
# … with 14 more rows
Just cbind the missing column as NA:
reshape(cbind(mywidedata, y.3 = NA), varying = 3:8, direction = "long")
# id1 id2 time x y id
# 1.1 A 10 1 98 215 1
# 2.1 A 20 1 38 208 2
# 3.1 A 30 1 97 205 3
# 4.1 B 10 1 61 207 4
# 5.1 B 30 1 73 201 5
# 6.1 C 10 1 96 202 6
# 7.1 C 20 1 100 202 7
# 8.1 C 30 1 94 202 8
# 1.2 A 10 2 73 208 1
# 2.2 A 20 2 69 218 2
# 3.2 A 30 2 64 219 3
# 4.2 B 10 2 104 213 4
# 5.2 B 30 2 99 203 5
# 6.2 C 10 2 92 206 6
# 7.2 C 20 2 49 206 7
# 8.2 C 30 2 59 209 8
# 1.3 A 10 3 63 NA 1
# 2.3 A 20 3 91 NA 2
# 3.3 A 30 3 42 NA 3
# 4.3 B 10 3 67 NA 4
# 5.3 B 30 3 90 NA 5
# 6.3 C 10 3 74 NA 6
# 7.3 C 20 3 86 NA 7
# 8.3 C 30 3 83 NA 8
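If you'd rather derive the missing wide column than type it, the same idea can be automated; a sketch that assumes the "<letter>.<number>" naming used above:
wide <- mywidedata
full <- do.call(paste, c(expand.grid(c("x", "y"), 1:3), sep = "."))  # x.1, y.1, ..., y.3
wide[setdiff(full, names(wide))] <- NA                               # adds the missing y.3
reshape(wide, idvar = c("id1", "id2"),
        varying = setdiff(names(wide), c("id1", "id2")),
        direction = "long", sep = ".")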
We can use melt from data.table
library(data.table)
melt(setDT(mywidedata), measure = patterns("^x", "^y"), value.name = c('x', 'y'))
# id1 id2 variable x y
# 1: A 10 1 97 215
# 2: A 20 1 75 202
# 3: A 30 1 87 213
# 4: B 10 1 51 206
# 5: B 30 1 75 203
# 6: C 10 1 41 210
# 7: C 20 1 58 211
# 8: C 30 1 50 207
# 9: A 10 2 92 204
#10: A 20 2 60 207
#11: A 30 2 35 201
#12: B 10 2 83 202
#13: B 30 2 81 202
#14: C 10 2 55 216
#15: C 20 2 68 204
#16: C 30 2 70 218
#17: A 10 3 89 NA
#18: A 20 3 108 NA
#19: A 30 3 47 NA
#20: B 10 3 78 NA
#21: B 30 3 43 NA
#22: C 10 3 106 NA
#23: C 20 3 92 NA
#24: C 30 3 96 NA
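If you want the grouping column to be called id3 as in the question, melt accepts a variable.name argument, so the same call can be written as (a sketch):
melt(setDT(mywidedata), measure = patterns("^x", "^y"),
     value.name = c("x", "y"), variable.name = "id3")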

Apply Lag function Dynamically on all columns

Hi, I have a data frame with 100 columns. I am trying to calculate current_value - lag(current_value).
I can calculate it for a single column, but I am unable to run the code on all available columns dynamically.
Sample df
Class <- c("A","A","A","A","B","B","B","C","C","C","C","C","C")
A<-c(23,33,45,56,22,34,34,45,65,5,57,75,57)
D<-c(2,133,5,60,23,312,341,25,75,50,3,9,21)
M<-c(34,35,67,325,46,56,547,47,67,67,68,3,12)
df <- data.frame(Class,A,D,M)
I have tried the code below:
df <- df %>% group_by(Class) %>%
mutate(A_lag =(A-lag(A)))
How can I calculate all the lag columns dynamically?
Thanks in advance
Class <- c("A","A","A","A","B","B","B","C","C","C","C","C","C")
A<-c(23,33,45,56,22,34,34,45,65,5,57,75,57)
D<-c(2,133,5,60,23,312,341,25,75,50,3,9,21)
M<-c(34,35,67,325,46,56,547,47,67,67,68,3,12)
df <- data.frame(Class,A,D,M)
library(dplyr)
df %>%
group_by(Class) %>%
mutate_all(~.-lag(.)) %>%
ungroup()
# # A tibble: 13 x 4
# Class A D M
# <fct> <dbl> <dbl> <dbl>
# 1 A NA NA NA
# 2 A 10 131 1
# 3 A 12 -128 32
# 4 A 11 55 258
# 5 B NA NA NA
# 6 B 12 289 10
# 7 B 0 29 491
# 8 C NA NA NA
# 9 C 20 50 20
#10 C -60 -25 0
#11 C 52 -47 1
#12 C 18 6 -65
#13 C -18 12 9
Or, if you want to add new columns alongside the existing ones:
df %>%
group_by(Class) %>%
mutate_all(funs(new = .-lag(.))) %>%
ungroup()
# # A tibble: 13 x 7
# Class A D M A_new D_new M_new
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 A 23 2 34 NA NA NA
# 2 A 33 133 35 10 131 1
# 3 A 45 5 67 12 -128 32
# 4 A 56 60 325 11 55 258
# 5 B 22 23 46 NA NA NA
# 6 B 34 312 56 12 289 10
# 7 B 34 341 547 0 29 491
# 8 C 45 25 47 NA NA NA
# 9 C 65 75 67 20 50 20
#10 C 5 50 67 -60 -25 0
#11 C 57 3 68 52 -47 1
#12 C 75 9 3 18 6 -65
#13 C 57 21 12 -18 12 9
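mutate_all() and funs() are superseded in current dplyr; here is a sketch of the same calculation with across() (assumes dplyr >= 1.0.0; the "_new" suffix is just an example name):
df %>%
  group_by(Class) %>%
  mutate(across(everything(), ~ .x - lag(.x), .names = "{.col}_new")) %>%
  ungroup()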
Using data.table and calculating lag for only numeric variables:
library(data.table)
setDT(df)
df[,
lapply(.SD, function(x) x - shift(x)),
by = Class,
.SDcols = (sapply(df, is.numeric))
]
Class A D M
1: A NA NA NA
2: A 10 131 1
3: A 12 -128 32
4: A 11 55 258
5: B NA NA NA
6: B 12 289 10
7: B 0 29 491
8: C NA NA NA
9: C 20 50 20
10: C -60 -25 0
11: C 52 -47 1
12: C 18 6 -65
13: C -18 12 9
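If you want to keep the original columns and add the differences alongside them by reference, a sketch (the "_diff" suffix is just an example name):
cols <- names(df)[sapply(df, is.numeric)]
new_cols <- paste0(cols, "_diff")
df[, (new_cols) := lapply(.SD, function(x) x - shift(x)), by = Class, .SDcols = cols]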

Calculate Ranks for Each Group, but counting ties as 1

Following up from this post:
Calculate ranks for each group
df <- ddply(df, .(type), transform, pos = rank(x, ties.method = "min")-1)
Using the method described in the above post, when you have multiple ties within the same TYPE, the ranking output (Pos) gets a little messy and hard to interpret, though it is technically still accurate.
For example:
library(plyr)
df <- data.frame(type = c(rep("a",11), rep("b",6), rep("c",2), rep("d", 6)),
x = c(50:53, rep(54, 3), 55:56, rep(57, 2), rep(51,3), rep(52,2), 56,
53, 57, rep(52, 2), 54, rep(58, 2), 70))
df<-ddply(df,.(type),transform, pos=rank(x,ties.method="min")-1)
Produces:
Type X Pos
a 50 0
a 51 1
a 52 2
a 53 3
a 54 4
a 54 4
a 54 4
a 55 7
a 56 8
a 57 9
a 57 9
b 51 0
b 51 0
b 51 0
b 52 3
b 52 3
b 56 5
c 53 0
c 57 1
d 52 0
d 52 0
d 54 2
d 58 3
d 58 3
d 70 5
The Pos relative ranking is correct (equal values are ranked the same, lower values ranked lower, and higher values ranked higher), but I have been trying to make the output look prettier. Any thoughts?
I'd like to get the output to look like this:
Type X Pos
a 50 1
a 51 2
a 52 3
a 53 4
a 54 5
a 54 5
a 54 5
a 55 6
a 56 7
a 57 8
a 57 8
b 51 1
b 51 1
b 51 1
b 52 2
b 52 2
b 56 3
c 53 1
c 57 2
d 52 1
d 52 1
d 54 2
d 58 3
d 58 3
d 70 4
This format, of course, assumes that the total number of records in each group doesn't matter. By taking away the "-1", we can remove the 0's, but that only solves one aspect. I've tried playing around with different equations and ties.method values, but to no avail.
Maybe the rank() function isn't what I should be using?
It seems you are looking for a dense rank, for example with frank from data.table:
library(data.table)
as.data.table(df)[, pos := frank(x, ties.method = 'dense'), by = 'type'][]
# type x pos
# 1: a 50 1
# 2: a 51 2
# 3: a 52 3
# 4: a 53 4
# 5: a 54 5
# 6: a 54 5
# 7: a 54 5
# 8: a 55 6
# 9: a 56 7
# 10: a 57 8
# 11: a 57 8
# 12: b 51 1
# 13: b 51 1
# 14: b 51 1
# 15: b 52 2
# 16: b 52 2
# 17: b 56 3
# 18: c 53 1
# 19: c 57 2
# 20: d 52 1
# 21: d 52 1
# 22: d 54 2
# 23: d 58 3
# 24: d 58 3
# 25: d 70 4
# type x pos
dense_rank in dplyr does the same thing:
library(dplyr)
df %>% group_by(type) %>% mutate(pos = dense_rank(x)) %>% ungroup()
# # A tibble: 25 x 3
# type x pos
# <fctr> <dbl> <int>
# 1 a 50 1
# 2 a 51 2
# 3 a 52 3
# 4 a 53 4
# 5 a 54 5
# 6 a 54 5
# 7 a 54 5
# 8 a 55 6
# 9 a 56 7
# 10 a 57 8
# # ... with 15 more rows
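If you prefer to avoid packages, a base R sketch of the same dense ranking (this simply overwrites the pos column created in the question):
df$pos <- ave(df$x, df$type, FUN = function(v) match(v, sort(unique(v))))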

reorder subset of data.table records

Let's say I have the following data.table:
set.seed(123)
dt <- data.table (id=1:10,
group=sample(LETTERS[1:3], 10, replace=TRUE),
val=sample(1:100, 10, replace=TRUE),
ltr=sample(letters, 10),
col5=sample(100:200, 10)
)
setkey(dt, id)
(dt)
# id group val ltr col5
# 1: 1 A 96 x 197
# 2: 2 C 46 r 190
# 3: 3 B 68 p 168
# 4: 4 C 58 w 177
# 5: 5 C 11 o 102
# 6: 6 A 90 v 145
# 7: 7 B 25 k 172
# 8: 8 C 5 l 120
# 9: 9 B 33 f 129
# 10: 10 B 96 c 121
Now I want to process it grouped by group; within each group I need to order the records by the val column and then do some manipulations on the ordered group (for example, add a column with the ltr values concatenated in that order):
# id group val ltr letters
# 1 6 A 90 v v_x
# 2 1 A 96 x v_x
# 3 7 B 25 k k_f_p_c
# 4 9 B 33 f k_f_p_c
# 5 3 B 68 p k_f_p_c
# 6 10 B 96 c k_f_p_c
# 7 8 C 5 l l_o_r_w
# 8 5 C 11 o l_o_r_w
# 9 2 C 46 r l_o_r_w
# 10 4 C 58 w l_o_r_w
(in this example the whole table is ordered but this is not required)
That's how I imagine the code in general:
dt1 <- dt[,
{
# processing here, reorder somehow
# ???
# ...
list(id=id, ltr=ltr, letters=paste0(ltr,collapse="_"))
},
by=group]
Thanks in advance for any ideas!
UPD. As noted in the answers, for my example I can simply order by group and then by val. But what if I need several different orderings? For example, I want to sort by col5 and add a col5diff column showing the difference of the col5 values:
# id group val ltr col5 letters col5diff
# 1: 6 A 90 v 145 v_x
# 2: 1 A 96 x 197 v_x 52
# 3: 10 B 96 c 121 k_f_p_c
# 4: 9 B 33 f 129 k_f_p_c 8
# 5: 3 B 68 p 168 k_f_p_c 47
# 6: 7 B 25 k 172 k_f_p_c 51
# 7: 5 C 11 o 102 l_o_r_w
# 8: 8 C 5 l 120 l_o_r_w 18
# 9: 4 C 58 w 177 l_o_r_w 75
#10: 2 C 46 r 190 l_o_r_w 88
OK, for this example the calculations of letters and col5diff are independent, so I can simply do them consecutively:
setkey(dt, "group", "val")
dt[, letters := paste(ltr, collapse="_"), by = group]
setkey(dt, "group", "col5")
dt<-dt[, col5diff:={
diff <- NA;
for (i in 2:length(col5)) {diff <- c(diff, col5[i]-col5[1]);}
diff; # updated to use := instead of list - thanks to comment of #Frank
}, by = group]
But I would also be glad to know what to do if I needed to use both of these orderings in a single {} block.
I think you're just looking for order
dt[, letters:=paste(ltr[order(val)], collapse="_"), by=group]
dt[order(group, val)]
# id group val ltr col5 letters
# 1: 6 A 90 v 145 v_x
# 2: 1 A 96 x 197 v_x
# 3: 7 B 25 k 172 k_f_p_c
# 4: 9 B 33 f 129 k_f_p_c
# 5: 3 B 68 p 168 k_f_p_c
# 6: 10 B 96 c 121 k_f_p_c
# 7: 8 C 5 l 120 l_o_r_w
# 8: 5 C 11 o 102 l_o_r_w
# 9: 2 C 46 r 190 l_o_r_w
#10: 4 C 58 w 177 l_o_r_w
Or, if you do not want to add a column by reference:
dt[, list(id, val, ltr, letters=paste(ltr[order(val)], collapse="_")),
by=group][order(group, val)]
# group id val ltr letters
# 1: A 6 90 v v_x
# 2: A 1 96 x v_x
# 3: B 7 25 k k_f_p_c
# 4: B 9 33 f k_f_p_c
# 5: B 3 68 p k_f_p_c
# 6: B 10 96 c k_f_p_c
# 7: C 8 5 l l_o_r_w
# 8: C 5 11 o l_o_r_w
# 9: C 2 46 r l_o_r_w
#10: C 4 58 w l_o_r_w
Unless I'm missing something, this just requires setting the key of your data.table to group and val:
setkey(dt, "group", "val")
# id group val ltr col5
# 1: 6 A 90 v 145
# 2: 1 A 96 x 197
# 3: 7 B 25 k 172
# 4: 9 B 33 f 129
# 5: 3 B 68 p 168
# 6: 10 B 96 c 121
# 7: 8 C 5 l 120
# 8: 5 C 11 o 102
# 9: 2 C 46 r 190
# 10: 4 C 58 w 177
You see that the values are automatically ordered. Now you can subset by group:
dt[, letters := paste(ltr, collapse="_"), by = group]
# id group val ltr col5 letters
# 1: 6 A 90 v 145 v_x
# 2: 1 A 96 x 197 v_x
# 3: 7 B 25 k 172 k_f_p_c
# 4: 9 B 33 f 129 k_f_p_c
# 5: 3 B 68 p 168 k_f_p_c
# 6: 10 B 96 c 121 k_f_p_c
# 7: 8 C 5 l 120 l_o_r_w
# 8: 5 C 11 o 102 l_o_r_w
# 9: 2 C 46 r 190 l_o_r_w
# 10: 4 C 58 w 177 l_o_r_w
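To address the UPD follow-up: both derived columns can be computed in one grouped call, each using its own ordering, without rekeying the table. A sketch, where col5diff is the difference from the group's smallest col5 (NA for that row):
dt[, `:=`(
  letters  = paste(ltr[order(val)], collapse = "_"),
  col5diff = {d <- col5 - min(col5); d[which.min(col5)] <- NA; d}
), by = group]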
