Related
I have data like this:
df <-
a b c
1 2 3
1 2 4
1 2 5
1 2 9
2 3 3
2 3 4
2 3 5
2 3 9
3 4 3
3 4 4
3 4 5
3 4 9
I want to remove duplicate rows base on column a but keep the values in column c as in:
df2 <-
a b c c1 c2 c3
1 2 3 4 5 9
2 3 3 4 5 9
3 4 3 4 5 9
I know how to remove duplicates as in :
df2 <-df[!(df$a=="1"),]
But have now idea how to add the values to the kept row.
We can exclude c while subsettting the rows of the dataset, unlist, and then concatenate with the whole 'c' column
c(unlist(df[!duplicated(df$a), 1:2]), c = df$c)
# a b c1 c2 c3 c4
# 1 2 3 4 5 9
If we need the same names as in the expected
c(unlist(df[!duplicated(df$a), 1:2]),
setNames(df$c, make.unique(rep('c', nrow(df)), sep="")))
# a b c c1 c2 c3
# 1 2 3 4 5 9
With the new example
library(dplyr)
library(tidyr)
df2 %>%
group_by(a) %>%
summarise(b = first(b), c = list(as.list(c))) %>%
unnest_wider(c(c))%>%
rename_at(vars(starts_with('.')), ~ str_c('c', seq_along(.)))
# A tibble: 2 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 2 3 4 5 9
Or with again updated example
df3 %>%
group_by(a) %>%
summarise(b = first(b), c = list(as.list(c))) %>%
unnest_wider(c(c))%>%
rename_at(vars(starts_with('.')), ~ str_c('c', seq_along(.)))
# A tibble: 3 x 6
# a b c1 c2 c3 c4
# <int> <int> <int> <int> <int> <int>
#1 1 2 3 4 5 9
#2 2 3 3 4 5 9
#3 3 4 3 4 5 9
Or with data.table
library(data.table)
setDT(df3)[, c(.(b = first(b)),
as.data.frame.list(setNames(c, rep('c', .N)))), a]
# a b c c.1 c.2 c.3
#1: 1 2 3 4 5 9
#2: 2 3 3 4 5 9
#3: 3 4 3 4 5 9
data
df <- structure(list(a = c(1L, 1L, 1L, 1L), b = c(2L, 3L, 3L, 4L),
c = c(3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), b = c(2L,
3L, 3L, 4L, 2L, 3L, 3L, 4L), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L,
9L)), class = "data.frame", row.names = c(NA, -8L))
df3 <- structure(list(a = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), b = c(2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L
), c = c(3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L, 3L, 4L, 5L, 9L)), class = "data.frame", row.names = c(NA,
-12L))
Within a group, I want to find the difference between that row and the first time that user appeared in the data. For example, I need to create the diff variable below. Users have different number of rows each as in the following data:
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L), diff = c(NA, 3L, 4L,
6L, NA, 2L, 3L, NA, NA, 8L)), .Names = c("ID", "money", "occurence",
"diff"), class = "data.frame", row.names = c(NA, -10L))
ID money occurence diff
1 1 9 1 NA
2 1 12 2 3
3 1 13 3 4
4 1 15 4 6
5 2 5 1 NA
6 2 7 2 2
7 2 8 3 3
8 3 5 1 NA
9 4 2 1 NA
10 4 10 2 8
You can use ave(). We just remove the first value per group and replace it with NA, and subtract the first value from the rest of the values.
with(df, ave(money, ID, FUN = function(x) c(NA, x[-1] - x[1])))
# [1] NA 3 4 6 NA 2 3 NA NA 8
A dplyr solution, which uses the first function to get the first value and calculate the difference.
library(dplyr)
df2 <- df %>%
group_by(ID) %>%
mutate(diff = money - first(money)) %>%
mutate(diff = replace(diff, diff == 0, NA)) %>%
ungroup()
df2
# # A tibble: 10 x 4
# ID money occurence diff
# <int> <int> <int> <int>
# 1 1 9 1 NA
# 2 1 12 2 3
# 3 1 13 3 4
# 4 1 15 4 6
# 5 2 5 1 NA
# 6 2 7 2 2
# 7 2 8 3 3
# 8 3 5 1 NA
# 9 4 2 1 NA
# 10 4 10 2 8
Update
Here is a data.table solution provided by Sotos. Notice that no need to replace 0 with NA.
library(data.table)
setDT(df)[, money := money - first(money), by = ID][]
# ID money occurence diff
# 1: 1 0 1 NA
# 2: 1 3 2 3
# 3: 1 4 3 4
# 4: 1 6 4 6
# 5: 2 0 1 NA
# 6: 2 2 2 2
# 7: 2 3 3 3
# 8: 3 0 1 NA
# 9: 4 0 1 NA
# 10: 4 8 2 8
DATA
dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L),
money = c(9L, 12L, 13L, 15L, 5L, 7L, 8L, 5L, 2L, 10L), occurence = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L)), .Names = c("ID", "money",
"occurence"), row.names = c(NA, -10L), class = "data.frame")
I have a program that gives me data in this format
toy
file_path Condition Trial.Num A B C ID A B C ID A B C ID
1 root/some.extension Baseline 1 2 3 5 car 2 1 7 bike 4 9 0 plane
2 root/thing.extension Baseline 2 3 6 45 car 5 4 4 bike 9 5 4 plane
3 root/else.extension Baseline 3 4 4 6 car 7 5 4 bike 68 7 56 plane
4 root/uniquely.extension Treatment 1 5 3 7 car 1 7 37 bike 9 8 7 plane
5 root/defined.extension Treatment 2 6 7 3 car 4 6 8 bike 9 0 8 plane
My goal is to tidy the format into something that at least can be easier to finally tidy with reshape having unique column names
tidy_toy
file_path Condition Trial.Num A B C ID
1 root/some.extension Baseline 1 2 3 5 car
2 root/thing.extension Baseline 2 3 6 45 car
3 root/else.extension Baseline 3 4 4 6 car
4 root/uniquely.extension Treatment 1 5 3 7 car
5 root/defined.extension Treatment 2 6 7 3 car
6 root/some.extension Baseline 1 2 1 7 bike
7 root/thing.extension Baseline 2 5 4 4 bike
8 root/else.extension Baseline 3 7 5 4 bike
9 root/uniquely.extension Treatment 1 1 7 37 bike
10 root/defined.extension Treatment 2 4 6 8 bike
11 root/some.extension Baseline 1 4 9 0 plane
12 root/thing.extension Baseline 2 9 5 4 plane
13 root/else.extension Baseline 3 68 7 56 plane
14 root/uniquely.extension Treatment 1 9 8 7 plane
15 root/defined.extension Treatment 2 9 0 8 plane
If I try to melt from toy it doesn't work because only the first ID column will get used for id.vars (hence everything will get tagged as cars). Identical variables will get dropped.
Here's the dput of both tables
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L), .Label = c("Baseline", "Treatment"), class = "factor"),
Trial.Num = c(1L, 2L, 3L, 1L, 2L), A = 2:6, B = c(3L, 6L,
4L, 3L, 7L), C = c(5L, 45L, 6L, 7L, 3L), ID = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "car", class = "factor"), A = c(2L,
5L, 7L, 1L, 4L), B = c(1L, 4L, 5L, 7L, 6L), C = c(7L, 4L,
4L, 37L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "bike", class = "factor"),
A = c(4L, 9L, 68L, 9L, 9L), B = c(9L, 5L, 7L, 8L, 0L), C = c(0L,
4L, 56L, 7L, 8L), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "plane", class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID", "A", "B", "C",
"ID", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-5L))
structure(list(file_path = structure(c(3L, 4L, 2L, 5L, 1L, 3L,
4L, 2L, 5L, 1L, 3L, 4L, 2L, 5L, 1L), .Label = c("root/defined.extension",
"root/else.extension", "root/some.extension", "root/thing.extension",
"root/uniquely.extension"), class = "factor"), Condition = structure(c(1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("Baseline",
"Treatment"), class = "factor"), Trial.Num = c(1L, 2L, 3L, 1L,
2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 3L, 1L, 2L), A = c(2L, 3L, 4L,
5L, 6L, 2L, 5L, 7L, 1L, 4L, 4L, 9L, 68L, 9L, 9L), B = c(3L, 6L,
4L, 3L, 7L, 1L, 4L, 5L, 7L, 6L, 9L, 5L, 7L, 8L, 0L), C = c(5L,
45L, 6L, 7L, 3L, 7L, 4L, 4L, 37L, 8L, 0L, 4L, 56L, 7L, 8L), ID = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L), .Label = c("bike",
"car", "plane"), class = "factor")), .Names = c("file_path",
"Condition", "Trial.Num", "A", "B", "C", "ID"), class = "data.frame", row.names = c(NA,
-15L))
You can use the make.unique-function to create unique column names. After that you can use melt from the data.table-package which is able to create multiple value-columns based on patterns in the columnnames:
# make the column names unique
names(toy) <- make.unique(names(toy))
# let the 'Condition' column start with a small letter 'c'
# so it won't be detected by the patterns argument from melt
names(toy)[2] <- tolower(names(toy)[2])
# load the 'data.table' package
library(data.table)
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
measure.vars = patterns('^A','^B','^C','^ID'),
value.name = c('A','B','C','ID'))
which gives:
> tidy_toy
file_path condition Trial.Num variable A B C ID
1: root/some.extension Baseline 1 1 2 3 5 car
2: root/thing.extension Baseline 2 1 3 6 45 car
3: root/else.extension Baseline 3 1 4 4 6 car
4: root/uniquely.extension Treatment 1 1 5 3 7 car
5: root/defined.extension Treatment 2 1 6 7 3 car
6: root/some.extension Baseline 1 2 2 1 7 bike
7: root/thing.extension Baseline 2 2 5 4 4 bike
8: root/else.extension Baseline 3 2 7 5 4 bike
9: root/uniquely.extension Treatment 1 2 1 7 37 bike
10: root/defined.extension Treatment 2 2 4 6 8 bike
11: root/some.extension Baseline 1 3 4 9 0 plane
12: root/thing.extension Baseline 2 3 9 5 4 plane
13: root/else.extension Baseline 3 3 68 7 56 plane
14: root/uniquely.extension Treatment 1 3 9 8 7 plane
15: root/defined.extension Treatment 2 3 9 0 8 plane
Another option is to use a list of column-indexes for measure.vars:
tidy_toy <- melt(setDT(toy),
measure.vars = list(c(4,8,12), c(5,9,13), c(6,10,14), c(7,11,15)),
value.name = c('A','B','C','ID'))
Making the column-names unique isn't necessary then.
A more complicated method that creates names that are better distinguishable by the patterns argument:
# select the names that are not unique
tt <- table(names(toy))
idx <- which(names(toy) %in% names(tt)[tt > 1])
nms <- names(toy)[idx]
# make them unique
names(toy)[idx] <- paste(nms,
rep(seq(length(nms) / length(names(tt)[tt > 1])),
each = length(names(tt)[tt > 1])),
sep = '.')
# your columnnames are now unique:
> names(toy)
[1] "file_path" "Condition" "Trial.Num" "A.1" "B.1" "C.1" "ID.1" "A.2"
[9] "B.2" "C.2" "ID.2" "A.3" "B.3" "C.3" "ID.3"
# tidy the data into long format
tidy_toy <- melt(setDT(toy),
measure.vars = patterns('^A.\\d','^B.\\d','^C.\\d','^ID.\\d'),
value.name = c('A','B','C','ID'))
which will give the same end-result.
As mentioned in the comments, the janitor-package can be helpful for this problem as well. The clean_names() works similar as the make.unique function. See here for an explanation.
with tidyverse we can do :
library(tidyverse)
toy %>%
repair_names(sep="_") %>%
pivot_longer(-(1:3),names_to = c(".value","id"), names_sep="_") %>%
select(-id)
#> # A tibble: 15 x 7
#> file_path Condition Trial.Num A B C ID
#> <fct> <fct> <int> <int> <int> <int> <fct>
#> 1 root/some.extension Baseline 1 2 3 5 car
#> 2 root/some.extension Baseline 1 2 1 7 bike
#> 3 root/some.extension Baseline 1 4 9 0 plane
#> 4 root/thing.extension Baseline 2 3 6 45 car
#> 5 root/thing.extension Baseline 2 5 4 4 bike
#> 6 root/thing.extension Baseline 2 9 5 4 plane
#> 7 root/else.extension Baseline 3 4 4 6 car
#> 8 root/else.extension Baseline 3 7 5 4 bike
#> 9 root/else.extension Baseline 3 68 7 56 plane
#> 10 root/uniquely.extension Treatment 1 5 3 7 car
#> 11 root/uniquely.extension Treatment 1 1 7 37 bike
#> 12 root/uniquely.extension Treatment 1 9 8 7 plane
#> 13 root/defined.extension Treatment 2 6 7 3 car
#> 14 root/defined.extension Treatment 2 4 6 8 bike
#> 15 root/defined.extension Treatment 2 9 0 8 plane
#> Warning message:
#> Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1, 2, 3, 4].
I can't find a good title for this question so feel free to edit it please.
I have this data.frame
section time to from
1 a 9 1 2
2 a 9 2 1
3 a 12 2 3
4 a 12 2 4
5 a 12 3 2
6 a 12 3 4
7 a 12 4 2
8 a 12 4 3
I want to remove duplicated rows that have the same to and from simultaneously, without computing permutations of the 2 columns: e.g (1,2) and (2,1) are duplicated.
So final output would be:
section time to from
1 a 9 1 2
3 a 12 2 3
4 a 12 2 4
6 a 12 3 4
I have a solution by constructing a new column key e.g
key <- paste(min(to,from),max(to,from))
and remove duplicated key using duplicated, but I think this is dirty solution.
here the dput of my data
structure(list(section = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "a", class = "factor"), time = c(9L, 9L, 12L,
12L, 12L, 12L, 12L, 12L), to = c(1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L), from = c(2L, 1L, 3L, 4L, 2L, 4L, 2L, 3L)), .Names = c("section",
"time", "to", "from"), row.names = c(NA, -8L), class = "data.frame")
mn <- pmin(s$to, s$from)
mx <- pmax(s$to, s$from)
int <- as.numeric(interaction(mn, mx))
s[match(unique(int), int),]
section time to from
1 a 9 1 2
3 a 12 2 3
4 a 12 2 4
6 a 12 3 4
Credit for the idea goes to this question: Remove consecutive duplicates from dataframe and specifically #MatthewPlourde's answer.
You can try using sort within the apply function to order the combinations.
mydf[!duplicated(t(apply(mydf[3:4], 1, sort))), ]
# section time to from
# 1 a 9 1 2
# 3 a 12 2 3
# 4 a 12 2 4
# 6 a 12 3 4
i have a data frame generated inside a for loop and have this structure
V1 V2 V3
1 a a 1
2 a b 3
3 a c 2
4 a d 1
5 a e 3
6 b a 3
7 b b 1
8 b c 8
9 b d 1
10 b e 1
11 c a 2
12 c b 8
the data is longer than this , but that's the idea that i want
(transform it to a wide table [V1 by V2])
V3 is a value based on (V1, V2)
i want to rearrange data to be like this (with first col is the unique of V1 and first row is the unique of V2 and data between them are from V3 )
a b c d e
a 1 3 2 1 3
b 3 1 8 1 1
c 2 8 2 8 2
d 1 1 5 7 2
e 3 5 9 5 3
thnx in advance.
Reproducible example of yours:
df <- structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), V2 = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L), .Label = c("a", "b", "c", "d", "e"), class = "factor"), V3 = c(1L, 3L, 2L, 1L, 3L, 3L, 1L, 8L, 1L, 1L, 2L, 8L)), .Names = c("V1", "V2", "V3"), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"))
And compute a basic crosstable based on your variables:
> xtabs(V3~V1+V2, df)
V2
V1 a b c d e
a 1 3 2 1 3
b 3 1 8 1 1
c 2 8 0 0 0
I hope you meant this :)
If df is your data-frame, assuming a unique V3 is mapped to each V1,V2 combination, you can do it with
with(df, tapply(V3, list(V1,V2), identity))
Another method, perhaps slightly more baroque, for widening a dataframe from a third column on the basis of the first two... with Chase that the OP has not given an unambiguous problem description:
df2 <- expand.grid(A=LETTERS[1:5], B=LETTERS[1:5])
df2$N <- 1:25
mtx <- outer(X=LETTERS[1:5],Y=LETTERS[1:5], FUN=function(x,y){
df2[intersect(which(df2$A==x), which(df2$B==y)), "N"] })
colnames(mtx)<-LETTERS[1:5]; rownames(mtx)<-LETTERS[1:5]
mtx
A B C D E
A 1 6 11 16 21
B 2 7 12 17 22
C 3 8 13 18 23
D 4 9 14 19 24
E 5 10 15 20 25
I'm sure there are many other strategies using reshape in base or dcast in reshape2.