Related
The code posted at the bottom does a nice job of filling in a dataframe, using package tidyr, so that all IDs end up with the same number of periods, in the case of period defined as number of months ("Period_1" in the below code). Base dataframe testDF has ID of 1 with 5 periods, and ID of 50 and 60 with only 3 periods each. The tidyr code creates additional periods ("Period_1") for ID of 50 and 60 so they too have 5 Period_1 values. The code copies down the "Bal" and "State" fields so that all IDs end up with the same number of Period_1 values, which is correct.
However, how would I extend the calendar month expression of "Period_2" in the same manner, as illustrated immediately below?
Code:
library(tidyr)
# Example data: three IDs with unequal numbers of monthly periods
# (ID 1 has 5 rows, IDs 50 and 60 have 3 rows each).
testDF <-
data.frame(
ID = as.numeric(c(rep(1,5),rep(50,3),rep(60,3))),
Period_1 = as.numeric(c(1:5,1:3,1:3)),
Period_2 = c("2012-06","2012-07","2012-08","2012-09","2012-10","2013-06","2013-07","2013-08","2012-01","2012-02","2012-03"),
Bal = as.numeric(c(rep(10,5),21:23,36:34)),
State = c("XX","AA","BB","CC","XX","AA","BB","CC","SS","XX","AA")
)
# Add a row for every ID x Period_1 combination that is missing, then carry
# the last observed Bal and State down into the new rows. Period_2 is not
# part of the fill, so it stays NA in the added rows.
testDFextend <-
testDF %>%
tidyr::complete(ID, nesting(Period_1)) %>%
tidyr::fill(Bal, State, .direction = "down")
testDFextend
Edit: rolling from one year to the next
A better OP example would have Period 2 = c("2012-06","2012-07","2012-08","2012-09","2012-10","2013-06","2013-07","2013-08","2012-10","2012-11","2012-12"), providing an example whereby extending Period_2 causes a rollover to the next year. Below I add to the tidyr/dplyr answer below to correctly roll over the year:
library(tidyr)
library(dplyr)
testDF <-
data.frame(
ID = as.numeric(c(rep(1,5),rep(50,3),rep(60,3))),
Period_1 = as.numeric(c(1:5,1:3,1:3)),
Period_2 = c("2012-06","2012-07","2012-08","2012-09","2012-10","2013-06","2013-07","2013-08","2012-10","2012-11","2012-12"),
Bal = as.numeric(c(rep(10,5),21:23,36:34)),
State = c("XX","AA","BB","CC","XX","AA","BB","CC","SS","XX","AA")
)
testDFextend <-
testDF %>%
tidyr::complete(ID, nesting(Period_1)) %>%
tidyr::fill(Bal, State, .direction = "down")
# Extend Period_2 ("YYYY-MM") into the rows that complete() added, rolling
# over into the next year when the extrapolated month passes December.
testDFextend %>%
  # Split "YYYY-MM" into integer year and month columns.
  separate(Period_2, into = c("year", "month"), convert = TRUE) %>%
  # Added rows have no year yet; carry the previous row's year down.
  fill(year) %>%
  group_by(ID) %>%
  # Extrapolate the missing months within each ID. The spline returns doubles
  # that may be off by floating-point noise, so round before using them as
  # whole months.
  mutate(month = as.integer(round(zoo::na.spline(month)))) %>%
  ungroup() %>%
  # Months are 1-based, so shift to 0-based before dividing: month 13 becomes
  # January of year + 1, and month 24 becomes December of year + 1 (not
  # year + 2).
  mutate(
    year = year + (month - 1L) %/% 12L,
    month = sprintf("%02d", (month - 1L) %% 12L + 1L)
  ) %>%
  unite("Period_2", year, month, sep = "-") %>%
  select(ID, Period_1, Period_2, Bal, State)
A tidyverse solution based on zoo::na.spline. Note that it does not handle year changes. It's harder than I thought, especially because zoo::na.spline does not seem to work on yearmon format.
library(tidyr)
library(dplyr)
testDFextend %>%
# Split "YYYY-MM" into integer year and month columns.
separate(Period_2, into = c("year", "month"), convert = TRUE) %>%
# Rows without a Period_2 have no year; carry the previous row's year down.
fill(year) %>%
group_by(ID) %>%
# Fill the missing months within each ID by spline extrapolation and zero-pad
# them to two digits. Months past 12 are not wrapped into the next year here.
mutate(month = sprintf("%02d", zoo::na.spline(month))) %>%
# Glue year and month back together as "YYYY-MM".
unite("Period_2", year, month, sep = "-")
output
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA
by ID you can strsplit the date, and take the elements to create a new data.frame to merge with.
## Longest run of periods over all IDs; every ID is extended to this length.
ml <- max(with(testDF, tapply(ID, ID, length)))

## For each ID, append the missing periods, repeating the last Bal and State,
## then stack the per-ID pieces back into one data frame.
by(testDF, testDF$ID, \(x) {
  n <- nrow(x)
  if (n == ml) {
    return(x)
  }
  first <- strsplit(x$Period_2[1], "-", fixed = TRUE)[[1]]
  ## Zero-based month counter (year * 12 + month - 1) for each added period.
  ## Counting months on a single axis lets the year roll over when the
  ## sequence passes December instead of producing e.g. "2012-13".
  months <- as.integer(first[1]) * 12L + as.integer(first[2]) - 1L + n:(ml - 1L)
  added <- data.frame(
    Period_2 = sprintf("%d-%02d", months %/% 12L, months %% 12L + 1L),
    Period_1 = (n + 1L):ml,
    ID = x$ID[n], Bal = x$Bal[n], State = x$State[n]
  )
  ## All columns are shared, so all = TRUE amounts to a sorted row union.
  merge(x, added, all = TRUE)
}) |> c(make.row.names = FALSE) |> do.call(what = rbind)
# ID Period_1 Period_2 Bal State
# 1 1 1 2012-06 10 XX
# 2 1 2 2012-07 10 AA
# 3 1 3 2012-08 10 BB
# 4 1 4 2012-09 10 CC
# 5 1 5 2012-10 10 XX
# 6 50 1 2013-06 21 AA
# 7 50 2 2013-07 22 BB
# 8 50 3 2013-08 23 CC
# 9 50 4 2013-09 23 CC
# 10 50 5 2013-10 23 CC
# 11 60 1 2012-01 36 SS
# 12 60 2 2012-02 35 XX
# 13 60 3 2012-03 34 AA
# 14 60 4 2012-04 34 AA
# 15 60 5 2012-05 34 AA
Edit
For older R versions (although it's recommended to always use up-to-date software), do:
## Same as the version above, without the native pipe or lambda shorthand.
## `ml` is the maximum period length computed earlier.
do.call(c(by(testDF, testDF$ID, function(x) {
  n <- nrow(x)
  if (n == ml) {
    return(x)
  }
  first <- strsplit(x$Period_2[1], "-", fixed = TRUE)[[1]]
  ## Zero-based month counter (year * 12 + month - 1) for each added period,
  ## so the year rolls over when the sequence passes December.
  months <- as.integer(first[1]) * 12L + as.integer(first[2]) - 1L + n:(ml - 1L)
  added <- data.frame(
    Period_2 = sprintf("%d-%02d", months %/% 12L, months %% 12L + 1L),
    Period_1 = (n + 1L):ml,
    ID = x$ID[n], Bal = x$Bal[n], State = x$State[n]
  )
  merge(x, added, all = TRUE)
}), make.row.names = FALSE), what = rbind)
For each ID convert Period_2 to yearmon class. This represents year and month without day. Internally it uses year + fraction where fraction = 0, 1/12, ..., 11/12 for the 12 months. Expand it out using seq. Then convert it back to character or omit the format line to keep the result as a yearmon object.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
testDFextend %>%
  group_by(ID) %>%
  # Start from each ID's first month and step forward one month per row
  # (1/12 of a year in yearmon arithmetic). Rows that had no Period_2 get the
  # following calendar months, rolling into the next year when needed.
  mutate(
    Period_2 = as.yearmon(first(Period_2)) +
      seq(0, by = 1 / 12, length.out = n())
  ) %>%
  # Back to "YYYY-MM" text; drop this step to keep a yearmon column.
  mutate(Period_2 = format(Period_2, "%Y-%m")) %>%
  ungroup()
giving:
# A tibble: 15 × 5
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA
I think the nicest way to do this is to make use of the padr package, which is built to pad data.frames where there are missing/incomplete columns.
This uses grouping and cur_data() to make the correct date sequence in Period_2.
library(dplyr)
library(tidyr)
library(padr)
# Number of periods every ID should end up with.
n_periods <- 5
testDF %>%
# Pad Period_1 within each ID up to n_periods (presumably adding NA-filled
# rows for the missing periods; see the padr documentation for pad_int).
pad_int(end_val = n_periods , by = "Period_1", group = "ID") %>%
group_by(ID) %>%
# Parse "YYYY-MM" as the first day of that month so seq() can step by month.
mutate(Period_2 = as.Date(paste0(Period_2, "-01"))) %>%
# Rebuild Period_2 per ID as n_periods consecutive months starting at the
# ID's first date, formatted back to "YYYY-MM".
# NOTE(review): cur_data() appears to be deprecated in dplyr >= 1.1.0 in
# favour of pick(); confirm against the installed dplyr version.
mutate(Period_2 = seq(cur_data()$Period_2[1], by = "months", length.out =
n_periods) %>% format("%Y-%m")) %>%
# Carry the last observed Bal and State into the padded rows.
fill(Bal, State) %>%
ungroup() %>%
select(ID, Period_1, Period_2, Bal, State)
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA
Note that this will handle cases when the year rolls over to the next year during Period_2.
Finally, you could adjust n_periods if you needed a different number of periods (or use a function to figure it out automatically, like jay.sf's answer).
I have a summary table I need to expand into unit-level observations to run test statistics on.
The summary table looks like this
tbl_summmary <-
tibble(
outcome = c("A (%)", "B (%)", "C (%)", "D (%)", "Total (n)"),
group_1 = c(.25, .25, .125, .325, 10),
group_2 = c(.50, 0.0, .325, .125, 20),
group_3 = c(.10, .40, .125, .325, 40))
tbl_summmary
The result needs to be a long format dataframe of the data described in the summary table.
df_simulation <-
bind_rows(
tibble(
group = 1,
outcome = c(
rep("A", .25*10),
rep("B", .25*10),
rep("C", .125*10),
rep("D", .325*10))),
tibble(
group = 2,
outcome = c(
rep("A", .50*20),
rep("B", .00*20),
rep("C", .325*20),
rep("D", .125*20))),
tibble(
group = 3,
outcome = c(
rep("A", .10*40),
rep("B", .40*40),
rep("C", .125*40),
rep("D", .325*40))))
df_simulation
However, the code needs to iterate over multiple variables and multiple groups. Typing out each group and variable manually like in the above won't be scalable. A way of doing this programmatically would be much appreciated!
One option would be to use tidyr::uncount after some additional data wrangling steps:
library(tidyr)
library(dplyr)
# Totals: take the last row of the summary table ("Total (n)") and reshape it
# to one row per group, with the "group_" prefix stripped from the group name.
totals <- tbl_summmary[nrow(tbl_summmary),] |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
select(group, total = value)
# Outcomes: every row except the last holds a share per group.
outcomes <- tbl_summmary[-nrow(tbl_summmary),] |>
# Keep only the leading word of the label, e.g. "A (%)" -> "A".
mutate(outcome = gsub("^(\\w+).*$", "\\1", outcome)) |>
pivot_longer(-outcome, names_to = "group", names_prefix = "group_") |>
left_join(totals, by = "group") |>
# share * total is not always a whole number, so round to get a row count.
# Note that round() sends exact halves to the even neighbour (2.5 -> 2,
# 6.5 -> 6), which here gives the same counts as the truncation done by rep().
mutate(n = round(value * total)) |>
select(group, outcome, n) |>
# Repeat each (group, outcome) row n times; rows with n = 0 disappear.
uncount(n) |>
mutate(group = as.numeric(group)) |>
arrange(group, outcome)
# Tabulate the expanded rows to check the counts per outcome and group.
count(outcomes, outcome, group)
#> # A tibble: 11 × 3
#> outcome group n
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 A 2 10
#> 3 A 3 4
#> 4 B 1 2
#> 5 B 3 16
#> 6 C 1 1
#> 7 C 2 6
#> 8 C 3 5
#> 9 D 1 3
#> 10 D 2 2
#> 11 D 3 13
identical(df_simulation, outcomes)
#> [1] TRUE
I do like the approach by @stefan.
If you want the format you requested, it can be done by listing the outcome values by the total amount then unnesting the values, then putting it into long format.
library(dplyr)
library(tidyr)
library(stringr)
df <- tbl_summmary |>
mutate(across(starts_with("group"), ~ .x *.x[5])) |>
rowwise() |>
mutate(across(starts_with("group"), ~
ifelse(.x != 0,
list(rep(str_remove(outcome, " \\(%\\)"), .x)),
list(NA_character_))))
df[-5, -1] |>
unnest_wider(col = everything(), names_sep = "_") |>
stack() |>
mutate(ind = str_extract(str_remove(ind, "_\\d+$"), "\\d"), .before = 1) |>
rename(group = ind, outcome = values) |>
drop_na()
outcome group
1 A 1
2 B 1
3 C 1
4 D 1
5 A 1
6 B 1
7 D 1
8 D 1
9 A 2
10 C 2
11 D 2
12 A 2
13 C 2
14 D 2
15 A 2
16 C 2
17 A 2
18 C 2
19 A 2
20 C 2
21 A 2
22 C 2
23 A 2
24 A 2
25 A 2
26 A 2
27 A 3
28 B 3
29 C 3
30 D 3
31 A 3
32 B 3
33 C 3
34 D 3
35 A 3
36 B 3
37 C 3
38 D 3
39 A 3
40 B 3
41 C 3
42 D 3
43 B 3
44 C 3
45 D 3
46 B 3
47 D 3
48 B 3
49 D 3
50 B 3
51 D 3
52 B 3
53 D 3
54 B 3
55 D 3
56 B 3
57 D 3
58 B 3
59 D 3
60 B 3
61 D 3
62 B 3
63 B 3
64 B 3
I have a Dataset in R like this (my real dataset has more rows and columns):
AB1
AB3
AB4
XB1
XB3
XB4
12
34
0
5
3
7
I need to sum the column similar like
AB1+XB1 AB3+XB3 AB4+XB4
What is the code I can use?
You could use
library(dplyr)
df %>%
# For every AB* column, derive the partner column's name by replacing "AB"
# with "XB", add the two, and store the result as sum_<AB column name>.
# NOTE(review): the partner is read from the global `df`, not from the piped
# data, so this assumes the pipe input is `df` itself, ungrouped and unfiltered.
mutate(across(starts_with("AB"),
~.x + df[[gsub("AB", "XB", cur_column())]],
.names = "sum_{.col}"))
This returns
# A tibble: 1 x 9
AB1 AB3 AB4 XB1 XB3 XB4 sum_AB1 sum_AB3 sum_AB4
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 12 34 0 5 3 7 17 37 7
We use across and mutate in this approach.
First we select all columns starting with AB. The desired sums are always ABn + XBn, so we can use this pattern.
Next we replace AB in the name of the current selected column with XB and sum those two columns. These sums are stored in a new column prefixed with sum_.
Assuming it is the first character that changes and the others are used to group
df <- read.table(text = "
AB1 AB3 AB4 XB1 XB3 XB4
12 34 0 5 3 7
11 35 1 7 2 8", header = TRUE)

# Everything after the first character of a column name identifies its group.
suffixes <- substring(colnames(df), 2)

# One column of row sums per distinct suffix. Columns are picked by exact
# suffix equality rather than by pattern matching, so "B1" does not also pull
# in a column such as "AB10". drop = FALSE keeps a data frame for rowSums()
# even when a suffix occurs only once.
suffix_sums <- vapply(
  unique(suffixes),
  function(s) rowSums(df[, suffixes == s, drop = FALSE]),
  numeric(nrow(df))
)
suffix_sums
B1 B3 B4
[1,] 17 37 7
[2,] 18 37 9
We can try the code below
cbind(
df,
list2DF(lapply(
split.default(df, gsub("\\D+", "", names(df))),
rowSums
))
)
which gives
AB1 AB3 AB4 XB1 XB3 XB4 1 3 4
1 12 34 0 5 3 7 17 37 7
Try this:
library(tidyverse)
tribble(
~AB1, ~AB3, ~AB4, ~XB1, ~XB3, ~XB4,
12, 34, 0, 5, 3, 7
) |>
pivot_longer(everything(), names_pattern = "(\\w\\w)(\\d)", names_to = c("prefix", "suffix")) |>
pivot_wider(names_from = prefix) |>
rowwise() |>
mutate(sum = sum(c_across(- suffix)))
#> # A tibble: 3 × 4
#> # Rowwise:
#> suffix AB XB sum
#> <chr> <dbl> <dbl> <dbl>
#> 1 1 12 5 17
#> 2 3 34 3 37
#> 3 4 0 7 7
Created on 2022-05-11 by the reprex package (v2.0.1)
If you know that structure is consistent (an "A" and "X" pair for everything), then this should work.
cols <- unique(substring(names(df), 2))
df[paste0("A", cols)] + df[paste0("X", cols)]
Using the 2 row DF2 in the Note as input calculate the suffixes (s), unique suffixes (u) and perform the indicated matrix multiplication giving (m). Finally convert that back to a data frame and set the names. No packages are used.
s <- substring(names(DF2), 2)
u <- unique(s)
m <- as.matrix(DF2) %*% outer(s, u, `==`)
sums <- setNames(as.data.frame(m), u); sums
## B1 B3 B4
## 1 17 37 7
## 2 17 37 7
If it is desired to append these as columns to DF2 then:
data.frame(DF2, sum = sums)
## AB1 AB3 AB4 XB1 XB3 XB4 sum.B1 sum.B3 sum.B4
## 1 12 34 0 5 3 7 17 37 7
## 2 12 34 0 5 3 7 17 37 7
Note
# Single-row example input with integer columns.
DF <- data.frame(
  AB1 = 12L, AB3 = 34L, AB4 = 0L,
  XB1 = 5L, XB3 = 3L, XB4 = 7L
)
# The same row twice, so the examples have more than one row to work on.
DF2 <- rbind(DF, DF)
DF2
## AB1 AB3 AB4 XB1 XB3 XB4
## 1 12 34 0 5 3 7
## 2 12 34 0 5 3 7
An option with across2 from dplyover
library(dplyover)
df1 %>%
mutate(across2(starts_with('AB'), starts_with('XB'),
~ .x + .y, .names = "sum_{xcol}"))
AB1 AB3 AB4 XB1 XB3 XB4 sum_AB1 sum_AB3 sum_AB4
1 12 34 0 5 3 7 17 37 7
I have two dataframes df1 and df2, I am looking for the simplest operation to get df3.
I want to replace rows in df1 with rows from df2 if the ids match (so rbind.fill is not a solution), and append rows from df2 where the id does not exist in df1, but only for columns that exist in df2.
I guess I could use several joins and antijoins and then merge but I wonder if there already exists a function for that operation.
df1 <- data.frame(id = 1:5, c1 = 11:15, c2 = 16:20, c3 = 21:25)
df2 <- data.frame(id = 4:7, c1 = 1:4, c2 = 5:8)
df1
id c1 c2 c3
1 11 16 21
2 12 17 22
3 13 18 23
4 14 19 24
5 15 20 25
df2
id c1 c2
4 1 5
5 2 6
6 3 7
7 4 8
df3
id c1 c2 c3
1 11 16 21
2 12 17 22
3 13 18 23
4 1 5 24
5 2 6 25
6 3 7 NULL
7 4 8 NULL
We can use {powerjoin}, make a full join and deal with the conflicts using coalesce_xy (which is really dplyr::coalesce) :
library(powerjoin)
df1 <- data.frame(id = 1:5, c1 = 11:15, c2 = 16:20, c3 = 21:25)
df2 <- data.frame(id = 4:7, c1 = 1:4, c2 = 5:8)
# Full join on id. powerjoin's join verbs are named power_*_join(); columns
# present in both tables (c1, c2) are merged into one by the `conflict`
# function instead of being suffixed.
# NOTE(review): coalesce_xy keeps the df1 value where both sides have one
# (ids 4 and 5 in the output below). The question wants df2 to win on matching
# ids; powerjoin also offers coalesce_yx for that. Confirm which is intended.
power_full_join(df1, df2, by = "id", conflict = coalesce_xy)
# id c1 c2 c3
# 1 1 11 16 21
# 2 2 12 17 22
# 3 3 13 18 23
# 4 4 14 19 24
# 5 5 15 20 25
# 6 6 3 7 NA
# 7 7 4 8 NA
I ended up with :
#' Update `df1` with the rows of `df2`, matched on `id`
#'
#' Rows of `df2` replace rows of `df1` that have the same `id`; rows of `df2`
#' with a new `id` are appended. Columns that exist only in `df1` are kept and
#' are `NA` for the appended rows.
#'
#' @param df1 Data frame to update. Must contain an `id` column.
#' @param df2 Data frame with the new rows. Must contain an `id` column.
#' @return A data frame with the combined rows, sorted by `id`.
special_combine <- function(df1, df2) {
  stopifnot("id" %in% colnames(df1), "id" %in% colnames(df2))

  shared <- colnames(df1) %in% colnames(df2)
  # drop = FALSE keeps both pieces as data frames even when only one column is
  # selected (e.g. df1 has no columns beyond those in df2).
  df1_int <- df1[, shared, drop = FALSE]
  df1_ext <- df1[, c("id", colnames(df1)[!shared]), drop = FALSE]

  stacked <- dplyr::bind_rows(df1_int, df2)
  # df2 rows come last, so keeping the last occurrence of each id lets df2
  # override df1.
  latest <- stacked[!duplicated(stacked$id, fromLast = TRUE), , drop = FALSE]

  # Re-attach the df1-only columns; ids that are new in df2 get NA there.
  combined <- dplyr::left_join(latest, df1_ext, by = "id")
  dplyr::arrange(combined, id)
}
I have two data frame as follows:
df1<-data.frame(st=c(1,2,3,4),v1=c(12,14,15,75),v2=c(43,32,12,18))
df1
st v1 v2
1 1 12 43
2 2 14 32
3 3 15 12
4 4 75 18
df2<-data.frame(st=c(1,2,3,4),v1=c(12,24,35,18),v2=c(48,32,121,82),v3=c(53,11,12,75))
df2
st v1 v2 v3
1 1 12 48 53
2 2 24 32 11
3 3 35 121 12
4 4 18 82 75
What i want is to match both the data frame at a "st" column level i.e. for st = 1 in df1 the corresponding values for v1 and v2 are 12 & 43. So if for st= 1 in df2 if any of the variables contain these values then I want to select st, and those values from df2.
So for the above example the output will be
St values
1 12(coming from v1 in df2)
2 32(coming from v2 in df2)
3 12(coming from v3 in df2)
4 18 75(coming from v1 & v3 in df2)
The important thing to note is, in the output data frame the order of selected variables should be as that of df2, as you can see that for st = 4, the values in df1 are 75 & 18 which matches with st = 2 but still the output is 18 and then 75 which is the order in df2. Also, df2 will always have more variables than df1.
If I understand you correctly...
Step 0. prepare data
You mentioned that you only want to select rows that fit your conditions, but the sample dataset has at least one match in each row. I tweaked it such that there's no match for St=3, to demonstrate that the row will not be returned in the result.
df1<-data.frame(st=c(1,2,3,4),v1=c(12,14,15,75),v2=c(43,32,12,18))
df2<-data.frame(st=c(1,2,3,4),v1=c(12,24,35,18),v2=c(48,32,121,82),v3=c(53,11,13,75))
Step 1. combine the datasets
combined.df <- rbind(df1 %>% gather(v, n, -st) %>% mutate(df = "df1"),
df2 %>% gather(v, n, -st) %>% mutate(df = "df2"))
> head(combined.df)
st v n df
1 1 v1 12 df1
2 2 v1 14 df1
3 3 v1 15 df1
4 4 v1 75 df1
5 1 v2 43 df1
6 2 v2 32 df1
Step 2. compare & keep only matched ones from df2
# Within each st, keep a df2 value only if it also occurs among the df1 values
# for that st.
res <- combined.df %>%
group_by(st) %>%
# df1 rows keep their value; df2 rows keep theirs only when it is found in the
# group's df1 values, otherwise it becomes NA.
mutate(n = ifelse(df=="df1", n, ifelse(n %in% n[df=="df1"], n, NA))) %>%
ungroup() %>%
# Only the matched df2 rows are of interest.
filter(df=="df2", !is.na(n)) %>%
# Sorting by variable name within st gives df2's column order as long as the
# names (v1, v2, v3, ...) sort in that order.
arrange(st, v)
# if you just want the values, you can stop here.
> res
# A tibble: 4 × 4
st v n df
<dbl> <chr> <dbl> <chr>
1 1 v1 12 df2
2 2 v2 32 df2
3 4 v1 18 df2
4 4 v3 75 df2
# this part formats the result to follow that of the desired output
res <- res %>%
group_by(st) %>%
summarise(values = paste(as.character(n), collapse = " ")) %>%
ungroup()
> res
# A tibble: 3 × 2
st values
<dbl> <chr>
1 1 12
2 2 32
3 4 18 75
If you use merge function, you can create a unique df with this matches:
new<-merge(df1,df2,by="st")
new
st v1.x v2.x v1.y v2.y v3
1 1 12 43 12 48 53
2 2 14 32 24 32 11
3 3 15 12 35 121 12
4 4 75 18 18 82 75
And if you want, you then can order it in the way you want. For example:
# Stack every value column of the merged table `new` into long form: one row
# per (st, column), with `from` naming the column of `new` the value came
# from (e.g. "from v1.x" for df1's v1, "from v1.y" for df2's v1).
value_cols <- names(new)[-1]
pieces <- lapply(value_cols, function(col) {
  data.frame(st = new[[1]], value = new[[col]], from = paste0("from ", col))
})
# Bind once at the end instead of growing the data frame inside a loop.
new2 <- do.call(rbind, pieces)
This is not the most efficient way, but if you have few data, it will work