Reshape: “Error: index out of bounds” - r

I have quite a large data frame with the following structure:
image coef v3 v4 v5 v6 ... v20
1 A 0 1 2 3
1 B 2 4 6 5
1 C 1 2 4 7
1 D 4 5 6 4
2 A 2 3 4 5
2 B 2 3 4 5
2 C 2 3 4 5
2 D 2 3 4 5
I need to end up with a "flattened" structure on the coef variable for each image index. Currently each image has its variables in the shape [4:20], but I need it to be [1:80] with the pattern [A,B,C,D,A',B',C',D'...].
Like this:
image v3 v4 v5 v6 v7 v8 v9 v10 ... v80
1 0 2 1 4 1 4 2 5
2 2 2 2 2 3 3 3 3
I tried to do:
reshape(df, timevar = "coef", idvar = "image", direction = "wide")
But it gives me the error:
Error in data[, timevar] : subindex out of bounds
I also tried the reshape2 library with:
dcast(df, image~coef, value.var= )
but since I have more than one value.var column I cannot figure out how to do it.

We can melt and then do the dcast
library(data.table)
dM <- melt(setDT(df1), id.var=c("image", "coef"))
dcast(dM, image~variable+coef, value.var="value")
Or use recast (which is a wrapper for melt/dcast) from reshape2
library(reshape2)
recast(df1, id.var=c("image", "coef"),image~variable+coef, value.var="value")
# image v3_A v3_B v3_C v3_D v4_A v4_B v4_C v4_D v5_A v5_B v5_C v5_D v6_A v6_B v6_C v6_D
#1 1 0 2 1 4 1 4 2 5 2 6 4 6 3 5 7 4
#2 2 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5
data
df1 <- structure(list(image = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
coef = c("A",
"B", "C", "D", "A", "B", "C", "D"), v3 = c(0L, 2L, 1L, 4L, 2L,
2L, 2L, 2L), v4 = c(1L, 4L, 2L, 5L, 3L, 3L, 3L, 3L), v5 = c(2L,
6L, 4L, 6L, 4L, 4L, 4L, 4L), v6 = c(3L, 5L, 7L, 4L, 5L, 5L, 5L,
5L)), .Names = c("image", "coef", "v3", "v4", "v5", "v6"),
class = "data.frame", row.names = c(NA, -8L))

Related

Remove duplicate rows in R and add entries of removed rows to kept row

I have data like this:
df <-
a b c
1 2 3
1 2 4
1 2 5
1 2 9
2 3 3
2 3 4
2 3 5
2 3 9
3 4 3
3 4 4
3 4 5
3 4 9
I want to remove duplicate rows based on column a but keep the values in column c, as in:
df2 <-
a b c c1 c2 c3
1 2 3 4 5 9
2 3 3 4 5 9
3 4 3 4 5 9
I know how to remove duplicates as in :
df2 <-df[!(df$a=="1"),]
But I have no idea how to add the values to the kept row.
We can exclude c while subsetting the rows of the dataset, unlist, and then concatenate with the whole 'c' column
c(unlist(df[!duplicated(df$a), 1:2]), c = df$c)
# a b c1 c2 c3 c4
# 1 2 3 4 5 9
If we need the same names as in the expected output
c(unlist(df[!duplicated(df$a), 1:2]),
setNames(df$c, make.unique(rep('c', nrow(df)), sep="")))
# a b c c1 c2 c3
# 1 2 3 4 5 9
With the new example
library(dplyr)
library(tidyr)
df2 %>%
group_by(a) %>%
summarise(b = first(b), c = list(as.list(c))) %>%
unnest_wider(c(c))%>%
rename_at(vars(starts_with('.')), ~ str_c('c', seq_along(.)))
# A tibble: 2 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 2 3 4 5 9
Or, with the example updated again
df3 %>%
group_by(a) %>%
summarise(b = first(b), c = list(as.list(c))) %>%
unnest_wider(c(c))%>%
rename_at(vars(starts_with('.')), ~ str_c('c', seq_along(.)))
# A tibble: 3 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 3 3 4 5 9
#3 3 4 3 4 5 9
Or with data.table
library(data.table)
setDT(df3)[, c(.(b = first(b)),
as.data.frame.list(setNames(c, rep('c', .N)))), a]
# a b c c.1 c.2 c.3
#1: 1 2 3 4 5 9
#2: 2 3 3 4 5 9
#3: 3 4 3 4 5 9
data
df <- structure(list(a = c(1L, 1L, 1L, 1L), b = c(2L, 3L, 3L, 4L),
c = c(3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), b = c(2L,
3L, 3L, 4L, 2L, 3L, 3L, 4L), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L,
9L)), class = "data.frame", row.names = c(NA, -8L))
df3 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), b = c(2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L
), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-12L))

Finding difference between specific rows by group

Within a group, I want to find the difference between each row and the first row in which that user appeared in the data. For example, I need to create the diff variable below. Each user has a different number of rows, as in the following data:
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L), diff = c(NA, 3L, 4L,
6L, NA, 2L, 3L, NA, NA, 8L)), .Names = c("ID", "money", "occurence",
"diff"), class = "data.frame", row.names = c(NA, -10L))
ID money occurence diff
1 1 9 1 NA
2 1 12 2 3
3 1 13 3 4
4 1 15 4 6
5 2 5 1 NA
6 2 7 2 2
7 2 8 3 3
8 3 5 1 NA
9 4 2 1 NA
10 4 10 2 8
You can use ave(). We just remove the first value per group and replace it with NA, and subtract the first value from the rest of the values.
with(df, ave(money, ID, FUN = function(x) c(NA, x[-1] - x[1])))
# [1] NA 3 4 6 NA 2 3 NA NA 8
A dplyr solution, which uses the first function to get the first value and calculate the difference.
library(dplyr)
df2 <- df %>%
group_by(ID) %>%
mutate(diff = money - first(money)) %>%
mutate(diff = replace(diff, diff == 0, NA)) %>%
ungroup()
df2
# # A tibble: 10 x 4
# ID money occurence diff
# <int> <int> <int> <int>
# 1 1 9 1 NA
# 2 1 12 2 3
# 3 1 13 3 4
# 4 1 15 4 6
# 5 2 5 1 NA
# 6 2 7 2 2
# 7 2 8 3 3
# 8 3 5 1 NA
# 9 4 2 1 NA
# 10 4 10 2 8
Update
Here is a data.table solution provided by Sotos. Note that there is no need to replace 0 with NA.
library(data.table)
setDT(df)[, money := money - first(money), by = ID][]
# ID money occurence diff
# 1: 1 0 1 NA
# 2: 1 3 2 3
# 3: 1 4 3 4
# 4: 1 6 4 6
# 5: 2 0 1 NA
# 6: 2 2 2 2
# 7: 2 3 3 3
# 8: 3 0 1 NA
# 9: 4 0 1 NA
# 10: 4 8 2 8
DATA
dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "money",
"occurence"), row.names = c(NA, -10L), class = "data.frame")

Tidy data.frame with repeated column names

I have a program that gives me data in this format
toy
file_path Condition Trial.Num A B C ID A B C ID A B C ID
1 root/some.extension Baseline 1 2 3 5 car 2 1 7 bike 4 9 0 plane
2 root/thing.extension Baseline 2 3 6 45 car 5 4 4 bike 9 5 4 plane
3 root/else.extension Baseline 3 4 4 6 car 7 5 4 bike 68 7 56 plane
4 root/uniquely.extension Treatment 1 5 3 7 car 1 7 37 bike 9 8 7 plane
5 root/defined.extension Treatment 2 6 7 3 car 4 6 8 bike 9 0 8 plane
My goal is to tidy the format into something that at least can be easier to finally tidy with reshape having unique column names
tidy_toy
file_path Condition Trial.Num A B C ID
1 root/some.extension Baseline 1 2 3 5 car
2 root/thing.extension Baseline 2 3 6 45 car
3 root/else.extension Baseline 3 4 4 6 car
4 root/uniquely.extension Treatment 1 5 3 7 car
5 root/defined.extension Treatment 2 6 7 3 car
6 root/some.extension Baseline 1 2 1 7 bike
7 root/thing.extension Baseline 2 5 4 4 bike
8 root/else.extension Baseline 3 7 5 4 bike
9 root/uniquely.extension Treatment 1 1 7 37 bike
10 root/defined.extension Treatment 2 4 6 8 bike
11 root/some.extension Baseline 1 4 9 0 plane
12 root/thing.extension Baseline 2 9 5 4 plane
13 root/else.extension Baseline 3 68 7 56 plane
14 root/uniquely.extension Treatment 1 9 8 7 plane
15 root/defined.extension Treatment 2 9 0 8 plane
If I try to melt from toy it doesn't work because only the first ID column will get used for id.vars (hence everything will get tagged as cars). Identical variables will get dropped.
Here's the dput of both tables
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("Baseline", "Treatment"), class = "factor"),
Trial.Num = c(1L, 2L, 3L, 1L, 2L), A = 2:6, B = c(3L, 6L,
4L, 3L, 7L), C = c(5L, 45L, 6L, 7L, 3L), ID = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "car", class = "factor"), A = c(2L,
5L, 7L, 1L, 4L), B = c(1L, 4L, 5L, 7L, 6L), C = c(7L, 4L,
4L, 37L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "bike", class = "factor"),
A = c(4L, 9L, 68L, 9L, 9L), B = c(9L, 5L, 7L, 8L, 0L), C = c(0L,
4L, 56L, 7L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "plane", class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID", "A", "B", "C",
"ID", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-5L))
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L, 3L,
4L, 2L, 5L, 1L, 3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("Baseline",
"Treatment"), class = "factor"), Trial.Num = c(1L, 2L, 3L, 1L,
2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 3L, 1L, 2L), A = c(2L, 3L, 4L,
5L, 6L, 2L, 5L, 7L, 1L, 4L, 4L, 9L, 68L, 9L, 9L), B = c(3L, 6L,
4L, 3L, 7L, 1L, 4L, 5L, 7L, 6L, 9L, 5L, 7L, 8L, 0L), C = c(5L,
45L, 6L, 7L, 3L, 7L, 4L, 4L, 37L, 8L, 0L, 4L, 56L, 7L, 8L), ID = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("bike",
"car", "plane"), class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-15L))
You can use the make.unique function to create unique column names. After that you can use melt from the data.table package, which is able to create multiple value columns based on patterns in the column names:
# make the column names unique
names(toy) <- make.unique(names(toy))
# let the 'Condition' column start with a small letter 'c'
# so it won't be detected by the patterns argument from melt
names(toy)[2] <- tolower(names(toy)[2])
# load the 'data.table' package
library(data.table)
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
measure.vars = patterns('^A','^B','^C','^ID'),
value.name = c('A','B','C','ID'))
which gives:
> tidy_toy
file_path condition Trial.Num variable A B C ID
1: root/some.extension Baseline 1 1 2 3 5 car
2: root/thing.extension Baseline 2 1 3 6 45 car
3: root/else.extension Baseline 3 1 4 4 6 car
4: root/uniquely.extension Treatment 1 1 5 3 7 car
5: root/defined.extension Treatment 2 1 6 7 3 car
6: root/some.extension Baseline 1 2 2 1 7 bike
7: root/thing.extension Baseline 2 2 5 4 4 bike
8: root/else.extension Baseline 3 2 7 5 4 bike
9: root/uniquely.extension Treatment 1 2 1 7 37 bike
10: root/defined.extension Treatment 2 2 4 6 8 bike
11: root/some.extension Baseline 1 3 4 9 0 plane
12: root/thing.extension Baseline 2 3 9 5 4 plane
13: root/else.extension Baseline 3 3 68 7 56 plane
14: root/uniquely.extension Treatment 1 3 9 8 7 plane
15: root/defined.extension Treatment 2 3 9 0 8 plane
Another option is to use a list of column-indexes for measure.vars:
tidy_toy <- melt(setDT(toy),
measure.vars = list(c(4,8,12), c(5,9,13), c(6,10,14), c(7,11,15)),
value.name = c('A','B','C','ID'))
Making the column-names unique isn't necessary then.
A more complicated method that creates names that are better distinguishable by the patterns argument:
# select the names that are not unique
tt <- table(names(toy))
idx <- which(names(toy) %in% names(tt)[tt > 1])
nms <- names(toy)[idx]
# make them unique
names(toy)[idx] <- paste(nms,
rep(seq(length(nms) / length(names(tt)[tt > 1])),
each = length(names(tt)[tt > 1])),
sep = '.')
# your columnnames are now unique:
> names(toy)
[1] "file_path" "Condition" "Trial.Num" "A.1" "B.1" "C.1" "ID.1" "A.2"
[9] "B.2" "C.2" "ID.2" "A.3" "B.3" "C.3" "ID.3"
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
measure.vars = patterns('^A.\\d','^B.\\d','^C.\\d','^ID.\\d'),
value.name = c('A','B','C','ID'))
which will give the same end-result.
As mentioned in the comments, the janitor package can be helpful for this problem as well. The clean_names() function works similarly to the make.unique function. See here for an explanation.
With the tidyverse we can do:
library(tidyverse)
toy %>%
repair_names(sep="_") %>%
pivot_longer(-(1:3),names_to = c(".value","id"), names_sep="_") %>%
select(-id)
#> # A tibble: 15 x 7
#> file_path Condition Trial.Num A B C ID
#> <fct> <fct> <int> <int> <int> <int> <fct>
#> 1 root/some.extension Baseline 1 2 3 5 car
#> 2 root/some.extension Baseline 1 2 1 7 bike
#> 3 root/some.extension Baseline 1 4 9 0 plane
#> 4 root/thing.extension Baseline 2 3 6 45 car
#> 5 root/thing.extension Baseline 2 5 4 4 bike
#> 6 root/thing.extension Baseline 2 9 5 4 plane
#> 7 root/else.extension Baseline 3 4 4 6 car
#> 8 root/else.extension Baseline 3 7 5 4 bike
#> 9 root/else.extension Baseline 3 68 7 56 plane
#> 10 root/uniquely.extension Treatment 1 5 3 7 car
#> 11 root/uniquely.extension Treatment 1 1 7 37 bike
#> 12 root/uniquely.extension Treatment 1 9 8 7 plane
#> 13 root/defined.extension Treatment 2 6 7 3 car
#> 14 root/defined.extension Treatment 2 4 6 8 bike
#> 15 root/defined.extension Treatment 2 9 0 8 plane
#> Warning message:
#> Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 2, 3, 4].

Remove duplicated 2 columns permutations

I can't find a good title for this question so feel free to edit it please.
I have this data.frame
section time to from
1 a 9 1 2
2 a 9 2 1
3 a 12 2 3
4 a 12 2 4
5 a 12 3 2
6 a 12 3 4
7 a 12 4 2
8 a 12 4 3
I want to remove duplicated rows that have the same to and from simultaneously, without computing permutations of the 2 columns: e.g (1,2) and (2,1) are duplicated.
So final output would be:
section time to from
1 a 9 1 2
3 a 12 2 3
4 a 12 2 4
6 a 12 3 4
I have a solution by constructing a new column key e.g
key <- paste(min(to,from),max(to,from))
and removing duplicated keys using duplicated, but I think this is a dirty solution.
Here is the dput of my data:
structure(list(section = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "a", class = "factor"), time = c(9L, 9L, 12L,
12L, 12L, 12L, 12L, 12L), to = c(1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L), from = c(2L, 1L, 3L, 4L, 2L, 4L, 2L, 3L)), .Names = c("section",
"time", "to", "from"), row.names = c(NA, -8L), class = "data.frame")
mn <- pmin(s$to, s$from)
mx <- pmax(s$to, s$from)
int <- as.numeric(interaction(mn, mx))
s[match(unique(int), int),]
section time to from
1 a 9 1 2
3 a 12 2 3
4 a 12 2 4
6 a 12 3 4
Credit for the idea goes to this question: Remove consecutive duplicates from dataframe, and specifically @MatthewPlourde's answer.
You can try using sort within the apply function to order the combinations.
mydf[!duplicated(t(apply(mydf[3:4], 1, sort))), ]
# section time to from
# 1 a 9 1 2
# 3 a 12 2 3
# 4 a 12 2 4
# 6 a 12 3 4

change data frame in R

I have a data frame generated inside a for loop, and it has this structure:
V1 V2 V3
1 a a 1
2 a b 3
3 a c 2
4 a d 1
5 a e 3
6 b a 3
7 b b 1
8 b c 8
9 b d 1
10 b e 1
11 c a 2
12 c b 8
The data is longer than this, but that is the general idea.
I want to transform it to a wide table (V1 by V2).
V3 is a value based on (V1, V2).
I want to rearrange the data to look like this (the first column holds the unique values of V1, the first row holds the unique values of V2, and the cells between them come from V3):
a b c d e
a 1 3 2 1 3
b 3 1 8 1 1
c 2 8 2 8 2
d 1 1 5 7 2
e 3 5 9 5 3
Thanks in advance.
Reproducible example of yours:
df <- structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), V2 = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L), .Label = c("a", "b", "c", "d", "e"), class = "factor"), V3 = c(1L, 3L, 2L, 1L, 3L, 3L, 1L, 8L, 1L, 1L, 2L, 8L)), .Names = c("V1", "V2", "V3"), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
And compute a basic crosstable based on your variables:
> xtabs(V3~V1+V2, df)
V2
V1 a b c d e
a 1 3 2 1 3
b 3 1 8 1 1
c 2 8 0 0 0
I hope you meant this :)
If df is your data-frame, assuming a unique V3 is mapped to each V1,V2 combination, you can do it with
with(df, tapply(V3, list(V1,V2), identity))
Another method, perhaps slightly more baroque, for widening a dataframe from a third column on the basis of the first two... and I agree with Chase that the OP has not given an unambiguous problem description:
df2 <- expand.grid(A=LETTERS[1:5], B=LETTERS[1:5])
df2$N <- 1:25
mtx <- outer(X=LETTERS[1:5],Y=LETTERS[1:5], FUN=function(x,y){
df2[intersect(which(df2$A==x), which(df2$B==y)), "N"] })
colnames(mtx)<-LETTERS[1:5]; rownames(mtx)<-LETTERS[1:5]
mtx
A B C D E
A 1 6 11 16 21
B 2 7 12 17 22
C 3 8 13 18 23
D 4 9 14 19 24
E 5 10 15 20 25
I'm sure there are many other strategies using reshape in base or dcast in reshape2.

Resources