How can I use merge so that I have data for all times? - r

I'm trying to change a data into which all entities have value for all possible times(months). Here's what I'm trying;
Class Value month
A 10 1
A 12 3
A 9 12
B 11 1
B 10 8
From the data above, I want to get the following data;
Class Value month
A 10 1
A NA 2
A 12 3
A NA 4
....
A 9 12
B 11 1
B NA 2
....
B 10 8
B NA 9
....
B NA 12
So I want to have all possible cells with through month from 1 to 12;
How can I do this? I'm right now trying it with merge function, but appreciate any other ways to approach.

We can use tidyverse
library(tidyverse)
df1 %>%
complete(Class, month = min(month):max(month)) %>%
select_(.dots = names(df1)) %>% #if we need to be in the same column order
as.data.frame() #if needed to convert to 'data.frame'

In base R using merge (where df is your data):
res <- data.frame(Class=rep(levels(df$Class), each=12), value=NA, month=1:12)
merge(df, res, by = c("Class", "month"), all.y = TRUE)[,c(1,3,2)]
# Class Value month
# 1 A 10 1
# 2 A NA 2
# 3 A 12 3
# 4 A NA 4
# 5 A NA 5
# 6 A NA 6
# 7 A NA 7
# 8 A NA 8
# 9 A NA 9
# 10 A NA 10
# 11 A NA 11
# 12 A 9 12
# 13 B 11 1
# 14 B NA 2
# 15 B NA 3
# 16 B NA 4
# 17 B NA 5
# 18 B NA 6
# 19 B NA 7
# 20 B 10 8
# 21 B NA 9
# 22 B NA 10
# 23 B NA 11
# 24 B NA 12
df <- structure(list(Class = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), Value = c(10L, 12L, 9L, 11L, 10L), month = c(1L,
3L, 12L, 1L, 8L)), .Names = c("Class", "Value", "month"), class = "data.frame", row.names = c(NA,
-5L))

To add to #akrun's answer, if you want to replace the NA values with 0, you can do the following:
library(dplyr)
library(tidyr)
df1 %>%
complete(Class, month = min(month):max(month)) %>%
mutate(Value = ifelse(is.na(Value),0,Value))

Related

R Overwrite column values with non NA values from column in separate dataframe

I have a dataframe 'df1' with a lot of columns, but the ones of interest are:
Number
Code
1
2
3
10
11
AMRO
4
277
2100
BLPH
And I have another dataframe 'df2' with a lot of columns, but the ones of interest are:
Number
Code
1
AMCR
2
AMCR
3
BANO
10
BAEA
12
AMRO
4
NA
277
NA
2100
NA
I want matching values in the 'Number' columns of 'df1' and 'df2' to lead to values in the 'Code' column in 'df2' to overwrite the 'Code' values in 'df1' as long as the 'Code' values in 'df2' don't contain an NA, so that the final result of 'df1' looks like:
Number
Code
1
AMCR
2
AMCR
3
BANO
10
BAEA
11
AMRO
4
277
2100
BLPH
Thank you for your help!
We can do
library(powerjoin)
power_left_join(df1, df2, by = "Number", conflict = coalesce)
-output
Number Code
1 1 AMCR
2 2 AMCR
3 3 BANO
4 10 BAEA
5 11 AMRO
6 4 <NA>
7 277 <NA>
8 2100 BLPH
Or to do an overwrite, use data.table
library(data.table)
setDT(df1)[df2, Code := fcoalesce(Code, i.Code), on = .(Number)]
-output
> df1
Number Code
<int> <char>
1: 1 AMCR
2: 2 AMCR
3: 3 BANO
4: 10 BAEA
5: 11 AMRO
6: 4 <NA>
7: 277 <NA>
8: 2100 BLPH
data
df1 <- structure(list(Number = c(1L, 2L, 3L, 10L, 11L, 4L, 277L, 2100L
), Code = c(NA, NA, NA, NA, "AMRO", NA, NA, "BLPH")),
class = "data.frame", row.names = c(NA,
-8L))
df2 <- structure(list(Number = c(1L, 2L, 3L, 10L, 12L, 4L, 277L, 2100L
), Code = c("AMCR", "AMCR", "BANO", "BAEA", "AMRO", NA, NA, NA
)), class = "data.frame", row.names = c(NA, -8L))
Here is an alternative approach using bind_cols:
library(dplyr)
bind_cols(df1, df2) %>%
mutate(Code = coalesce(Code...2, Code...4)) %>%
select(Number = Number...1, Code)
Number Code
1 1 AMCR
2 2 AMCR
3 3 BANO
4 10 BAEA
5 11 AMRO
6 4 <NA>
7 277 <NA>
8 2100 BLPH
Here is a solution playing with dplyr full_join and inner_join
library(dplyr)
df1 %>%
full_join(df2) %>% na.omit() %>%
full_join(df1 %>% inner_join(df2)) %>%
filter(Number %in% df1$Number) %>%
arrange(Number)
Output
#> Number Code
#> 1 1 AMCR
#> 2 2 AMCR
#> 3 3 BANO
#> 4 4 <NA>
#> 5 10 BAEA
#> 6 11 AMRO
#> 7 277 <NA>
#> 8 2100 BLPH

How to find rows with same values in two columns?

It's a little hard to explain, but I'm trying to compare the column "cpf" from two different data frames. I want to identify when the value in the two "cpf" columns from (df1) and (df2) is equal (these values can be in different rows). After that, I want to update the NA values if these are available from the other data frame
df1
cpf x y
1 21 NA NA
2 32 NA NA
3 43 NA NA
4 54 NA NA
5 65 NA NA
df2
cpf x y
1 54 5 10
2 0 NA NA
3 65 3 2
4 0 NA NA
5 0 NA NA
I want the following result
df3
cpf x y
1 21 NA NA
2 32 NA NA
3 43 NA NA
4 54 5 10
5 65 3 2
We could do a join on 'cpf' and use fcoalecse
library(data.table)
setDT(df1)[df2, c('x', 'y') := .(fcoalesce(x, i.x),
fcoalesce(y, i.y)), on = .(cpf)]
-output
df1
# cpf x y
#1: 21 NA NA
#2: 32 NA NA
#3: 43 NA NA
#4: 54 5 10
#5: 65 3 2
Or using coalecse from dplyr after a left_join
library(dplyr)
left_join(df1, df2, by = 'cpf') %>%
transmute(cpf, x = coalesce(x.x, x.y), y = coalesce(y.x, y.y))
# cpf x y
#1 21 NA NA
#2 32 NA NA
#3 43 NA NA
#4 54 5 10
#5 65 3 2
In base R, can use match
i1 <- match(df1$cpf, df2$cpf, nomatch = 0)
i2 <- match(df2$cpf, df1$cpf, nomatch = 0)
df1[i2, -1] <- df2[i1, -1]
data
df1 <- structure(list(cpf = c(21L, 32L, 43L, 54L, 65L), x = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), y = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), row.names = c("1",
"2", "3", "4", "5"), class = "data.frame")
df2 <- structure(list(cpf = c(54L, 0L, 65L, 0L, 0L), x = c(5L, NA, 3L,
NA, NA), y = c(10L, NA, 2L, NA, NA)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))
df1 %>%
left_join(df2, by = "cpf") %>%
select(cpf, x = x.y, y = y.y)
Output:
cpf x y
1 21 NA NA
2 32 NA NA
3 43 NA NA
4 54 5 10
5 65 3 2
Another base R option using merge
merge(df1,
df2,
by = "cpf",
all.x = TRUE,
suffixes = c(".x", "")
)[names(df1)]
gives
cpf x y
1 21 NA NA
2 32 NA NA
3 43 NA NA
4 54 5 10
5 65 3 2

Getting rows in data frame based on mutiple ranges in R

Let's say I have this table
a b
1 5 12
2 6 17
3 7 28
4 8 12
5 9 17
6 10 28
7 15 12
8 25 14
9 13 29
Also another table with index ranges:
start end
1 2 3
2 5 7
I want to get the rows in the first table based on the index ranges in the second table with a group name to differentiate, something like this:
a b group
2 6 17 1
3 7 28 1
5 9 17 2
6 10 28 2
7 15 12 2
how do I achieve this in R?
We can subset df1 taking the row index value from df2 using Map.
do.call(rbind, Map(function(x, y, z) transform(df1[x:y, ], group = z),
df2$start, df2$end, seq_len(nrow(df2))))
Or in purrr :
purrr::map2_dfr(df2$start, df2$end, ~df1[.x:.y, ], .id = "group")
# group a b
#1 1 6 17
#2 1 7 28
#3 2 9 17
#4 2 10 28
#5 2 15 12
data
df1 <- structure(list(a = c(5L, 6L, 7L, 8L, 9L, 10L, 15L, 25L, 13L),
b = c(12L, 17L, 28L, 12L, 17L, 28L, 12L, 14L, 29L)),
class = "data.frame", row.names = c(NA, -9L))
df2 <- structure(list(start = c(2L, 5L), end = c(3L, 7L)),
class = "data.frame", row.names = c(NA, -2L))
An option using data.table:
DT1[, rn := .I]
DT2[, g := .I]
DT1[, g := DT2[DT1, on=.(start<=rn, end>=rn), x.g]][
!is.na(g)]
output:
a b rn g
1: 6 17 2 1
2: 7 28 3 1
3: 9 17 5 2
4: 10 28 6 2
5: 15 12 7 2
data:
library(data.table)
DT1 <- fread("a b
5 12
6 17
7 28
8 12
9 17
10 28
15 12
25 14
13 29")
DT2 <- fread("start end
2 3
5 7")

Add rows to a data-frame based on values in one of the columns

Currently the data-frame looks something like this:
Scenario Month A B C
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459
I'd like to add rows on top of the row where Month = 1 as below. I know dplyr has an add_rows function but I'd like to add rows based on a condition. Any help is hugely appreciated.
Scenario Month A B C
0
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
0
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
0
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459
A solution using tidyverse.
library(tidyverse)
dat2 <- dat %>%
split(f = .$Scenario) %>%
map_dfr(~bind_rows(tibble(Scenario = 0), .x))
dat2
# # A tibble: 12 x 5
# Scenario Month A B C
# <dbl> <int> <dbl> <dbl> <dbl>
# 1 0 NA NA NA NA
# 2 1 1 -0.593 1.05 -0.594
# 3 1 2 0.179 2.04 0.111
# 4 1 3 1.21 -0.324 -1.40
# 5 0 NA NA NA NA
# 6 2 1 0.934 0.0526 -0.656
# 7 2 2 1.65 -1.07 0.799
# 8 2 3 1.61 -1.96 -1.82
# 9 0 NA NA NA NA
# 10 3 1 -0.622 1.63 -1.40
# 11 3 2 -1.90 -0.836 -1.83
# 12 3 3 0.164 -1.16 1.24
DATA
dat <- read.table(text = "Scenario Month A B C
1 1 -0.593186301 1.045550808 -0.593816304
1 2 0.178626141 2.043084432 0.111370583
1 3 1.205779717 -0.324083723 -1.397716949
2 1 0.933615199 0.052647056 -0.656486153
2 2 1.647291688 -1.065793671 0.799040546
2 3 1.613663101 -1.955567231 -1.817457972
3 1 -0.621991775 1.634069402 -1.404981646
3 2 -1.899326887 -0.836322394 -1.826351541
3 3 0.164235141 -1.160701812 1.238246459 ",
header = TRUE)
Somehow add_row doesn't take multiple values to its .before parameter.
One way is to split the dataframe wherever Month = 1 and then for each dataframe add a row using add_row above Month = 1.
library(tidyverse)
map_df(split(df, cumsum(df$Month == 1)),
~ add_row(., Scenario = 0, .before = which(.$Month == 1)))
# Scenario Month A B C
#1 0 NA NA NA NA
#2 1 1 -0.5931863 1.04555081 -0.5938163
#3 1 2 0.1786261 2.04308443 0.1113706
#4 1 3 1.2057797 -0.32408372 -1.3977169
#5 0 NA NA NA NA
#6 2 1 0.9336152 0.05264706 -0.6564862
#7 2 2 1.6472917 -1.06579367 0.7990405
#8 2 3 1.6136631 -1.95556723 -1.8174580
#9 0 NA NA NA NA
#10 3 1 -0.6219918 1.63406940 -1.4049816
#11 3 2 -1.8993269 -0.83632239 -1.8263515
#12 3 3 0.1642351 -1.16070181 1.2382465
Here is one option with data.table
library(data.table)
setDT(df1)[, .SD[c(.N+1, seq_len(.N))], Scenario][
!duplicated(Scenario), Scenario := 0][]
# Scenario Month A B C
# 1: 0 NA NA NA NA
# 2: 1 1 -0.5931863 1.04555081 -0.5938163
# 3: 1 2 0.1786261 2.04308443 0.1113706
# 4: 1 3 1.2057797 -0.32408372 -1.3977169
# 5: 0 NA NA NA NA
# 6: 2 1 0.9336152 0.05264706 -0.6564862
# 7: 2 2 1.6472917 -1.06579367 0.7990405
# 8: 2 3 1.6136631 -1.95556723 -1.8174580
# 9: 0 NA NA NA NA
#10: 3 1 -0.6219918 1.63406940 -1.4049816
#11: 3 2 -1.8993269 -0.83632239 -1.8263515
#12: 3 3 0.1642351 -1.16070181 1.2382465
Or as #chinsoon12 mentioned in the comments
setDT(df1)[, rbindlist(.(.(Scenario=0L), c(.(Scenario=rep(Scenario, .N)),
.SD)), use.names=TRUE, fill=TRUE), by=.(Scenario)][, -1L]
data
df1 <- structure(list(Scenario = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L
), Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), A = c(-0.593186301,
0.178626141, 1.205779717, 0.933615199, 1.647291688, 1.613663101,
-0.621991775, -1.899326887, 0.164235141), B = c(1.045550808,
2.043084432, -0.324083723, 0.052647056, -1.065793671, -1.955567231,
1.634069402, -0.836322394, -1.160701812), C = c(-0.593816304,
0.111370583, -1.397716949, -0.656486153, 0.799040546, -1.817457972,
-1.404981646, -1.826351541, 1.238246459)), class = "data.frame",
row.names = c(NA,
-9L))
Here's a simple way (without loops) using base R -
df1 <- df[rep(1:nrow(df), (df$Month == 1)+1), ]
df1[duplicated(df1, fromLast = T), ] <- NA
df1$Scenario[is.na(df1$Scenario)] <- 0
df1
Scenario Month A B C
1 0 NA NA NA NA
1.1 1 1 -0.5931863 1.04555081 -0.5938163
2 1 2 0.1786261 2.04308443 0.1113706
3 1 3 1.2057797 -0.32408372 -1.3977169
4 0 NA NA NA NA
4.1 2 1 0.9336152 0.05264706 -0.6564862
5 2 2 1.6472917 -1.06579367 0.7990405
6 2 3 1.6136631 -1.95556723 -1.8174580
7 0 NA NA NA NA
7.1 3 1 -0.6219918 1.63406940 -1.4049816
8 3 2 -1.8993269 -0.83632239 -1.8263515
9 3 3 0.1642351 -1.16070181 1.2382465
Data -
df <- structure(list(Scenario = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L
), Month = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), A = c(-0.593186301,
0.178626141, 1.205779717, 0.933615199, 1.647291688, 1.613663101,
-0.621991775, -1.899326887, 0.164235141), B = c(1.045550808,
2.043084432, -0.324083723, 0.052647056, -1.065793671, -1.955567231,
1.634069402, -0.836322394, -1.160701812), C = c(-0.593816304,
0.111370583, -1.397716949, -0.656486153, 0.799040546, -1.817457972,
-1.404981646, -1.826351541, 1.238246459)), class = "data.frame", row.names = c(NA,
-9L))

r - data frame manipulation [duplicate]

This question already has answers here:
Reshape multiple value columns to wide format
(5 answers)
Closed 5 years ago.
Suppose I have this data frame:
df <- data.frame(ID = c("id1", "id1", "id1", "id2", "id2", "id3", "id3", "id3"),
Code = c("A", "B", "C", "A", "B", "A", "C", "D"),
Count = c(34,65,21,3,8,12,15,16), Value = c(3,1,8,2,3,3,5,8))
that looks like this:
df
ID Code Count Value
1 id1 A 34 3
2 id1 B 65 1
3 id1 C 21 8
4 id2 A 3 2
5 id2 B 8 3
6 id3 A 12 3
7 id3 C 15 5
8 id3 D 16 8
I would like to obtain this result data frame:
result <- data.frame(Code = c("A", "B", "C", "D"),
id1_count = c(34,65,21,NA), id1_value = c(3,1,8,NA),
id2_count = c(3, 8, NA, NA), id2_value = c(2, 3, NA, NA),
id3_count = c(12,NA,15,16), id3_value = c(3,NA,5,8))
that looks like this:
> result
Code id1_count id1_value id2_count id2_value id3_count id3_value
1 A 34 3 3 2 12 3
2 B 65 1 8 3 NA NA
3 C 21 8 NA NA 15 5
4 D NA NA NA NA 16 8
Is there a one liner in the R base package that can do that? I am able to achieve the result I need but not in the R way (i.e., with loops and so on). Any help is appreciated. Thank you.
You can try dcast from devel version of data.table (v1.9.5) which can take multiple value.var columns. Instructions to install are here
library(data.table)
dcast(setDT(df), Code~ID, value.var=c('Count', 'Value'))
# Code Count_id1 Count_id2 Count_id3 Value_id1 Value_id2 Value_id3
#1: A 34 3 12 3 2 3
#2: B 65 8 NA 1 3 NA
#3: C 21 NA 15 8 NA 5
#4: D NA NA 16 NA NA 8
Or using reshape from base R
reshape(df, idvar='Code', timevar='ID', direction='wide')
# Code Count.id1 Value.id1 Count.id2 Value.id2 Count.id3 Value.id3
#1 A 34 3 3 2 12 3
#2 B 65 1 8 3 NA NA
#3 C 21 8 NA NA 15 5
#8 D NA NA NA NA 16 8
You could also try:
library(tidyr)
library(dplyr)
df %>%
gather(key, value, -(ID:Code)) %>%
unite(id_key, ID, key) %>%
spread(id_key, value)
Which gives:
# Code id1_Count id1_Value id2_Count id2_Value id3_Count id3_Value
#1 A 34 3 3 2 12 3
#2 B 65 1 8 3 NA NA
#3 C 21 8 NA NA 15 5
#4 D NA NA NA NA 16 8

Resources