merge by id and column name in R - r

I am trying to merge two data sets into one using id and column name as indices.
I have the following data
df <-
a b c d e f g id
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
panel_empty <-
id df_id df_data df1_data df2_data df3_data
1 a
1 b
1 c
1 d
1 e
1 f
1 g
2 a
2 b
2 c
2 d
2 e
2 f
2 g
3 a
3 b
3 c
3 d
3 e
3 f
3 g
4 a
4 b
4 c
4 d
4 e
4 f
4 g
I would like to merge these somehow to look like this
panel_full <-
id df_id df_data df2_data df3_data
1 a 1
1 b 1
1 c 1
1 d 1
1 e 1
1 f 1
1 g 1
2 a 2
2 b 2
2 c 2
2 d 2
2 e 2
2 f 2
2 g 2
3 a 3
3 b 3
3 c 3
3 d 3
3 e 3
3 f 3
3 g 3
4 a 4
4 b 4
4 c 4
4 d 4
4 e 4
4 f 4
4 g 4
I only know how to merge by id but have no idea how to merge by id and column name. For panel data this is quite important to do, and I was surprised not to find any similar problem on this site.
EDIT:
So far, I was able to convert from wide to long
long <- melt(df, id.vars = c("id"))
However, I do not know how to move on.
I tried
m1 <- merge(panel_emtpy, long, by.x = "id", by.y = "df_id")

Here's a way with dplyr and tidyr::gather() -
panel_full %>%
left_join(gather(df, df_id, df_data, -id), by = c("id", "df_id"))

Related

How to create a group based on pattern from another column?

I have a data frame as below,
dt <- data.frame(id = c("a","b","c","d","e","f","g","h","i","j"),
value = c(1,2,1,2,1,1,1,2,1,2))
> dt
id value
1 a 1
2 b 2
3 c 1
4 d 2
5 e 1
6 f 1
7 g 1
8 h 2
9 i 1
10 j 2
I hope to create a column based on column value so that whenever it runs into a 2 in column value it will assign a new group number. The output will look like,
dtgroup <- data.frame(id = c("a","b","c","d","e","f","g","h","i","j"),
value = c(1,2,1,2,1,1,1,2,1,2),
group = c(1,1,2,2,3,3,3,3,4,4))
> dtgroup
id value group
1 a 1 1
2 b 2 1
3 c 1 2
4 d 2 2
5 e 1 3
6 f 1 3
7 g 1 3
8 h 2 3
9 i 1 4
10 j 2 4
Any ideas? Thanks!
We can use findInterval like below
> transform(dt, group = 1 + findInterval(seq_along(value), which(value == 2), left.open = TRUE))
id value group
1 a 1 1
2 b 2 1
3 c 1 2
4 d 2 2
5 e 1 3
6 f 1 3
7 g 1 3
8 h 2 3
9 i 1 4
10 j 2 4
or cut
> transform(dt, group = as.integer(cut(seq_along(value), c(-Inf, which(value == 2)))))
id value group
1 a 1 1
2 b 2 1
3 c 1 2
4 d 2 2
5 e 1 3
6 f 1 3
7 g 1 3
8 h 2 3
9 i 1 4
10 j 2 4
Another possibility. Increment by one when value is 1 and the previous value (dplyr::lag) is not 1.
dt$group <- with(dt, cumsum(value == 1 & dplyr::lag(value != 1, default = 1)))
id value group
1 a 1 1
2 b 2 1
3 c 1 2
4 d 2 2
5 e 1 3
6 f 1 3
7 g 1 3
8 h 2 3
9 i 1 4
10 j 2 4
With cumsum, if value doesn't have NAs:
dt$group <- head(c(0,cumsum(dt$value==2))+1,-1)
dt
id value group
1 a 1 1
2 b 2 1
3 c 1 2
4 d 2 2
5 e 1 3
6 f 1 3
7 g 1 3
8 h 2 3
9 i 1 4
10 j 2 4

Is there an R function to merge two data frames based on two columns separately matching to the same column?

I would like to populate values ("VAL") based on each of two columns separately ("VALA", "VALB").
# Data
DF1 <- data.frame("colA" = rep(c("A","B"), 6),
"colB" = rep(c("C","D","E"), 4))
DF2 <- data.frame("colC" = c("A","B","C","D","E"),
"VAL" = 1:5)
# three join calls
tmp1 <- left_join(DF1, DF2, by=c("colA"="colC"))
names(tmp1)[3] <- "VALA"
tmp2 <- left_join(DF1, DF2, by=c("colB"="colC"))
names(tmp2)[3] <- "VALB"
left_join(tmp1, tmp2, by=c("colA", "colB"))
# colA colB VALA VALB
# 1 A C 1 3
# 2 A C 1 3
# 3 B D 2 4
# 4 B D 2 4
# 5 A E 1 5
# 6 A E 1 5
# 7 B C 2 3
# 8 B C 2 3
# 9 A D 1 4
# 10 A D 1 4
# 11 B E 2 5
# 12 B E 2 5
# 13 A C 1 3
# 14 A C 1 3
# 15 B D 2 4
# 16 B D 2 4
# 17 A E 1 5
# 18 A E 1 5
# 19 B C 2 3
# 20 B C 2 3
# 21 A D 1 4
# 22 A D 1 4
# 23 B E 2 5
# 24 B E 2 5
Why does the last operation give 24 rows as output instead of expected 12?
Is there any way to achieve the expected output more elegantly (instead of 3 join operations)?
You can use match to find the corresponding value and cbind the resulting columns.
cbind(DF1, VALA=DF2$VAL[match(DF1$colA, DF2$colC)],
VALB=DF2$VAL[match(DF1$colB, DF2$colC)])
colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
or use names:
x <- setNames(DF2$VAL, DF2$colC)
cbind(DF1, VALA=x[DF1$colA], VALB=x[DF1$colB])
and, in the case of many columns, use match inside lapply:
cbind(DF1, setNames(lapply(DF1, function(x) DF2$VAL[match(x, DF2$colC)]),
sub("col", "VAL", names(DF1))))
# colA colB VALA VALB
#1 A C 1 3
#2 B D 2 4
#3 A E 1 5
#4 B C 2 3
#5 A D 1 4
#6 B E 2 5
#7 A C 1 3
#8 B D 2 4
#9 A E 1 5
#10 B C 2 3
#11 A D 1 4
#12 B E 2 5
Try to combine left_join after one another using %>% and define its suffixes.
DF1 <- DF1 %>%
left_join(DF2, c("colA" = "colC")) %>%
left_join(DF2, c("colB" = "colC"),
suffix = c ("A", "B"))
> DF1
colA colB VALA VALB
1 A C 1 3
2 B D 2 4
3 A E 1 5
4 B C 2 3
5 A D 1 4
6 B E 2 5
7 A C 1 3
8 B D 2 4
9 A E 1 5
10 B C 2 3
11 A D 1 4
12 B E 2 5

How do I group_by if the column that I want to summarize with has all the same values

x l
1 1 a
2 3 b
3 2 c
4 3 b
5 2 c
6 4 d
7 5 f
8 2 c
9 1 a
10 1 a
11 3 b
12 4 d
The above is the input.
The below is the output.
x l
1 1 a
2 3 b
3 2 c
4 4 d
5 5 f
I know that column l will have the same value for each group_by(x).
l is a string
# Creation of dataset
x <- c(1,3,2,3,2,4,5,2,1,1,3,4)
l<- c("a","b","c","b","c","d","f","c","a","a","b","d")
df <- data.frame(x,l)
# Simply call unique function on your dataframe
dfu <- unique(df)

Join two dataframe

I have to collect values from one dataframe and place them in another. I have tried to use the merge function, but that messes up the order in the second dataframe.
This is how my data looks like.
> df<-as.data.frame(cbind(letters[1:4],1:4))
> df
V1 V2
1 a 1
2 b 2
3 c 3
4 d 4
> dflist <- data.frame("home"= sample(df[,1],15, replace = TRUE))
>
> dflist$away <-sample(df[,1],15, replace = TRUE)
> dflist
home away
1 a b
2 a a
3 d c
4 d a
5 c c
6 a c
7 b d
8 b b
9 a b
10 b d
11 b a
12 a a
13 a c
14 c b
15 d a
Desired result should look like this.
home away value1 value2
1 a b 1 2
2 a a 1 1
3 d c 4 3
4 d a 4 1
5 c c 3 3
.
The outcome table will lose its order if I use merge here.
You could try this:
dflist[c("value1", "value2")] <- t(apply(dflist, 1, function(x)
c(df[match(x[1], df$V1),2], df[match(x[2], df$V1),2])))
dflist
home away value1 value2
1 a b 1 2
2 a a 1 1
3 d c 4 3
4 d a 4 1
5 c c 3 3
6 a c 1 3
7 b d 2 4
8 b b 2 2
9 a b 1 2
10 b d 2 4
11 b a 2 1
12 a a 1 1
13 a c 1 3
14 c b 3 2
15 d a 4 1

Counting how many times an element occurs in the column of a data.frame

Let's say I have a data.frame with a factor.
d = data.frame(f = c("a","a","a","b","b","b","b","d","d"))
f
1 a
2 a
3 a
4 b
5 b
6 b
7 b
8 d
9 d
And I want to add a column telling me how many times an element occurs.
Like this
f n
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
How would I do this?
Can also use some plyr functions - join & ddply
d <- data.frame(f = c("a","a","a","b","b","b","b","d","d"))
d2 <- join(d, ddply(d, .(f), 'nrow'))
d2
f nrow
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
You can use table like this:
d$n <- table(d$f)[d$f]
# f n
#1 a 3
#2 a 3
#3 a 3
#4 b 4
#5 b 4
#6 b 4
#7 b 4
#8 d 2
#9 d 2
You can use ave and length:
> d$n <- as.numeric(ave(as.character(d$f), d$f, FUN = length))
> d
f n
1 a 3
2 a 3
3 a 3
4 b 4
5 b 4
6 b 4
7 b 4
8 d 2
9 d 2
With the "data.table" package, you might do something like:
library(data.table)
D <- data.table(d)
D[, n := as.numeric(.N), by = f]

Resources