Create new data frame with multiple subsets of same variable - r

I'd like to create a new data frame where the columns are subsets of the same variable that are split by a different variable. For example, I'd like to make a new subset of variable ('b') where the columns are split by a subset of a different variable ('year')
set.seed(88)
df <- data.frame(year = rep(1996:1998,3), a = runif(9), b = runif(9), e = runif(9))
df
year a b e
1 1996 0.41050128 0.97679183 0.7477684
2 1997 0.10273570 0.54925568 0.7627982
3 1998 0.74104481 0.74416429 0.2114261
4 1996 0.48007870 0.55296210 0.7377032
5 1997 0.99051343 0.18097104 0.8404930
6 1998 0.99954223 0.02063662 0.9153588
7 1996 0.03247379 0.33055434 0.9182541
8 1997 0.76020784 0.10246882 0.7055694
9 1998 0.67713100 0.59292207 0.4093590
Desired output for variable 'b' for years 1996 and 1998, is:
V1 V2
1 0.9767918 0.74416429
2 0.5529621 0.02063662
3 0.3305543 0.59292207
I could probably find a way to do this with a loop, but am wondering if there is a dplyr methed (or any simple method to accomplish this).

We subset dataset based on 1996, 1998 in 'year', select the 'year', 'b' columns and unstack to get the expected output
unstack(subset(df, year %in% c(1996, 1998), select = c('year', 'b')), b ~ year)
# X1996 X1998
#1 0.9767918 0.74416429
#2 0.5529621 0.02063662
##3 0.3305543 0.59292207
Or using tidyverse, we select the columns of interest, filter the rows based on the 'year' column, create a sequence column by 'year', spread to 'wide' format and select out the unwanted columns
library(tidyverse)
df %>%
select(year, b) %>%
filter(year %in% c(1996, 1998)) %>%
group_by(year = factor(year, levels = unique(year), labels = c('V1', 'V2'))) %>%
mutate(n = row_number()) %>%
spread(year, b) %>%
select(-n)
# A tibble: 3 x 2
# V1 V2
# <dbl> <dbl>
#1 0.977 0.744
#2 0.553 0.0206
#3 0.331 0.593
As there are only two 'year's, we can also use summarise
df %>%
summarise(V1 = list(b[year == 1996]), V2 = list(b[year == 1998])) %>%
unnest

Another option with dplyr, mixing in some base R, resulting in a tiny bit shorter solution than #akrun's code:
bind_cols(split(df$b, df$year)) %>% select(-'1997')
# A tibble: 3 x 2
`1996` `1998`
<dbl> <dbl>
1 0.977 0.744
2 0.553 0.0206
3 0.331 0.593

Related

Replacing NA values with mode from multiple imputation in R

I ran 5 imputations on a data set with missing values. For my purposes, I want to replace missing values with the mode from the 5 imputations. Let's say I have the following data sets, where df is my original data, ID is a grouping variable to identify each case, and imp is my imputed data:
df <- data.frame(ID = c(1,2,3,4,5),
var1 = c(1,NA,3,6,NA),
var2 = c(NA,1,2,6,6),
var3 = c(NA,2,NA,4,3))
imp <- data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5),
var1 = c(1,2,3,3,2,5,4,5,6,6,7,2,3,2,5,6,5,6,6,6,3,1,2,3,2),
var2 = c(4,3,2,3,2,4,6,5,4,4,7,2,4,2,3,6,5,6,4,5,3,3,4,3,2),
var3 = c(7,6,5,6,6,2,3,2,4,2,5,4,5,3,5,1,2,1,3,2,1,2,1,1,1))
I have a method that works, but it involves a ton of manual coding as I have ~200 variables total (I'm doing this on 3 different data sets with different variables). My code looks like this for one variable:
library(dplyr)
mode <- function(codes){
which.max(tabulate(codes))
}
var1 <- imp %>% group_by(ID) %>% summarise(var1 = mode(var1))
df3 <- df %>%
left_join(var1, by = "ID") %>%
mutate(var1 = coalesce(var1.x, var1.y)) %>%
select(-var1.x, -var1.y)
Thus, the original value in df is replaced with the mode only if the value was NA.
It is taking forever to keep manually coding this for every variable. I'm hoping there is an easier way of calculating the mode from the imputed data set for each variable by ID and then replacing the NAs with that mode in the original data. I thought maybe I could put the variable names in a vector and somehow iterate through them with one code where i changes to each variable name, but I didn't know where to go with that idea.
x <- colnames(df)
# Attempting to iterate through variables names using i
i = as.factor(x[[2]])
This is where I am stuck. Any help is much appreciated!
Here is one option using tidyverse. Essentially, we can pivot both dataframes long, then join together and coalesce in one step rather than column by column. Mode function taken from here.
library(tidyverse)
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
imp_long <- imp %>%
group_by(ID) %>%
summarise(across(everything(), Mode)) %>%
pivot_longer(-ID)
df %>%
pivot_longer(-ID) %>%
left_join(imp_long, by = c("ID", "name")) %>%
mutate(var1 = coalesce(value.x, value.y)) %>%
select(-c(value.x, value.y)) %>%
pivot_wider(names_from = "name", values_from = "var1")
Output
# A tibble: 5 × 4
ID var1 var2 var3
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 6
2 2 5 1 2
3 3 3 2 5
4 4 6 6 4
5 5 3 6 3
You can use -
library(dplyr)
mode_data <- imp %>%
group_by(ID) %>%
summarise(across(starts_with('var'), Mode))
df %>%
left_join(mode_data, by = 'ID') %>%
transmute(ID,
across(matches('\\.x$'),
function(x) coalesce(x, .[[sub('x$', 'y', cur_column())]]),
.names = '{sub(".x$", "", .col)}'))
# ID var1 var2 var3
#1 1 1 3 6
#2 2 5 1 2
#3 3 3 2 5
#4 4 6 6 4
#5 5 3 6 3
mode_data has Mode value for each of the var columns.
Join df and mode_data by ID.
Since all the pairs have name.x and name.y in their name, we can take all the name.x pairs replace x with y to get corresponding pair of columns. (.[[sub('x$', 'y', cur_column())]])
Use coalesce to select the non-NA value in each pair.
Change the column name by removing .x from the name. ({sub(".x$", "", .col)}) so var1.x becomes only var1.
where Mode function is taken from here
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
library(dplyr, warn.conflicts = FALSE)
imp %>%
group_by(ID) %>%
summarise(across(everything(), Mode)) %>%
bind_rows(df) %>%
group_by(ID) %>%
summarise(across(everything(), ~ coalesce(last(.x), first(.x))))
#> # A tibble: 5 × 4
#> ID var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 3 6
#> 2 2 5 1 2
#> 3 3 3 2 5
#> 4 4 6 6 4
#> 5 5 3 6 3
Created on 2022-01-03 by the reprex package (v2.0.1)
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}

Identifying values from one database to use in another database

I am working on a project in which I need to work with 2 databases, identify values from one database to use in another.
I have a dataframe 1,
df1<-data.frame("ID"=c(1,2,3),"Condition A"=c("B","B","A"),"Condition B"=c("1","1","2"),"Year"=c(2002,1988,1995))
and a dataframe 2,
df2 <- data.frame("Condition A"=c("A","A","B","B"),"Condiction B"=c("1","2","1","2"),"<1990"=c(20,30,50,80),"1990-2000"=c(100,90,80,30),">2000"=c(300,200,800,400))
I would like to add a new column to df1 called "Value", in which, for each ID (from df1), collects the values from column 3,4 or 5 from df2 (depending on the year), and following conditions A and B available in both databases. The end result would be something like this:
df1<-data.frame("ID"=c(1,2,3),"Condition A"=c("B","B","A"),"Condition B"=c("1","1","2"),"Year"=c(2002,1988,1995),"Value"=c(800,50,90))
thanks!
I think we can simply left_join, then mutate with case_when, then drop the undesired columns with select:
library(dplyr)
left_join(df1, df2, by=c("Condition.A", "Condition.B"))%>%
mutate(Value=case_when(Year<1990 ~ X.1990,
Year<2000 ~ X1990.2000,
Year>=2000 ~ X.2000))%>%
select(-starts_with("X"))
ID Condition.A Condition.B Year Value
1 1 B 1 2002 800
2 2 B 1 1988 50
3 3 A 2 1995 90
EDIT: I edited your code, removing the "Condiction" typo
You could use
library(dplyr)
library(tidyr)
df2 %>%
rename(Condition.B = Condiction.B) %>%
pivot_longer(matches("\\d+{4}")) %>%
right_join(df1, by = c("Condition.A", "Condition.B")) %>%
filter(name == case_when(
Year < 1990 ~ "X.1990",
Year > 2000 ~ "X.2000",
TRUE ~ "X1990.2000")) %>%
select(ID, Condition.A, Condition.B, Year, Value = value) %>%
arrange(ID)
This returns
# A tibble: 3 x 5
ID Condition.A Condition.B Year Value
<dbl> <chr> <chr> <dbl> <dbl>
1 1 B 1 2002 800
2 2 B 1 1988 50
3 3 A 2 1995 90
At first we rename the misspelled column Condiction.B of df2 and bring it into a "long format" based on the "<1990", "1990-2000", ">2000" columns. Note that those columns can't be named like this, they are automatically renamed to X.1990, X1990.2000 and X.2000.
Next we use a right join with df1 on the two Condition columns.
Finally we filter just the matching years based on a hard coded case_when function and do some clean up (selecting and arranging).
We could do it this way:
Condiction must be a typo so I changed it to Condition
in df1 create a helper column that assigns each your to the group which is a column name in df2
bring df2 in long format
finally apply left_join by by=c("Condition.A", "Condition.B", "helper"="name")
library(dplyr)
library(tidyr)
df1 <- df1 %>%
mutate(helper = case_when(Year >=1990 & Year <=2000 ~"X1990.2000",
Year <1990 ~ "X.1990",
Year >2000 ~ "X.2000"))
df2 <- df2 %>%
pivot_longer(
cols=starts_with("X")
)
df3 <- left_join(df1, df2, by=c("Condition.A", "Condition.B", "helper"="name")) %>%
select(-helper)
ID Condition.A Condition.B Year value
1 1 B 1 2002 800
2 2 B 1 1988 50
3 3 A 2 1995 90

How to get the difference of a lagged variable by date?

Consider the following example:
library(tidyverse)
library(lubridate)
df = tibble(client_id = rep(1:3, each=24),
date = rep(seq(ymd("2016-01-01"), (ymd("2016-12-01") + years(1)), by='month'), 3),
expenditure = runif(72))
In df you have stored information on monthly expenditure from a bunch of clients for the past 2 years. Now you want to calculate the monthly difference between this year and the previous year for each client.
Is there any way of doing this maintaining the "long" format of the dataset? Here I show you the way I am doing it nowadays, which implies going wide:
df2 = df %>%
mutate(date2 = paste0('val_',
year(date),
formatC(month(date), width=2, flag="0"))) %>%
select(client_id, date2, value) %>%
pivot_wider(names_from = date2,
values_from = value)
df3 = (df2[,2:13] - df2[,14:25])
However I find tihs unnecessary complex, and in large datasets going from long to wide can take quite a lot of time, so I think there must be a better way of doing it.
If you want to keep data in long format, one way would be to group by month and date value for each client_id and calculate the difference using diff.
library(dplyr)
df %>%
group_by(client_id, month_date = format(date, "%m-%d")) %>%
summarise(diff = -diff(expenditure))
# client_id month_date diff
# <int> <chr> <dbl>
# 1 1 01-01 0.278
# 2 1 02-01 -0.0421
# 3 1 03-01 0.0117
# 4 1 04-01 -0.0440
# 5 1 05-01 0.855
# 6 1 06-01 0.354
# 7 1 07-01 -0.226
# 8 1 08-01 0.506
# 9 1 09-01 0.119
#10 1 10-01 0.00819
# … with 26 more rows
An option with data.table
library(data.table)
library(zoo)
setDT(df)[, .(diff = -diff(expenditure)), .(client_id, month_date = as.yearmon(date))]

How to work with special row after group_by in R

I have a data frame something like bellow:
amount <- sample(10000:2000, 20)
year<- sample(2015:2017, 20, replace = TRUE)
company<- sample(LETTERS[1:3],20, replace = TRUE)
df<-data.frame(company, year, amount)
Then I want to group by company and year so I have:
df %>%
group_by(company, year) %>%
summarise(
total= sum(amount)
)
company year total
<fct> <int> <int>
1 A 2015 1094
2 A 2016 3308
3 A 2017 4785
4 B 2015 1190
5 B 2016 6583
6 B 2017 1964
7 C 2015 4974
8 C 2016 1986
9 C 2017 3465
Now, I want to divide the last row in each group to the first row. In other words, I want to divide the total value for the last year for each company to the same value of the first year.
Thanks.
You could use last and first to access those elements of total respectively :
library(dplyr)
df %>%
group_by(company, year) %>%
summarise(total= sum(amount)) %>%
summarise(final = last(total)/first(total))
# company final
# <fct> <dbl>
#1 A 2.26
#2 B 1.92
#3 C 0.565
In base R, we can use aggregate
aggregate(amount~company, aggregate(amount~company+year, df, sum),
function(x) x[length(x)]/x[1])
# company amount
#1 A 2.262524
#2 B 1.919138
#3 C 0.565281
With data.table, we can do
library(data.table)
setDT(df)[ , .(total = sum(amount)), .(company, year)][,
.(final = last(total)/first(total)), .(company)]

rowwise correlation for specific columns

For all rows (Symbols) i would like to correlated the five columns F1_G-F5_G with the five columns F1_P-F5_P. This should give one correlation value for each Symbol.
Symbol F1_G F2_G F3_G F4_G F5_G F1_P F2_P F3_P F4_P F5_P
1 abca2 0.7696639 1.301428 0.8447565 0.6936672 0.6987410 9.873610 9.705205 8.044027 8.311364 9.961380
2 aco2 7.4274715 7.234892 7.8543164 8.0142983 8.1512194 9.620114 9.767721 7.607115 7.854960 9.472660
3 adat3 -2.0560126 -1.536868 -0.4181457 -1.1946602 -0.7707472 8.975871 8.645235 7.926262 7.432755 8.633583
4 adat3 -2.0560126 -1.536868 -0.4181457 -1.1946602 -0.7707472 9.620114 9.237699 7.162386 7.972086 8.872763
5 adnp 1.4228436 0.932214 0.8964153 0.8125162 0.9921002 9.177645 9.323443 8.507508 8.080413 8.633583
6 arhgap8 -2.6517712 -2.067164 -1.4918958 -2.6517712 -1.5474257 9.395681 8.861322 8.333381 8.038053 8.872763
I tried something like this, but is does not consider each row:
res <- outer(df[, c(2,3,4,5,6)], df[, c(7,8,9,10,11)], function(X, Y){
mapply(function(...) cor.test(..., na.action = "na.exclude")$estimate,
X, Y)
})
out:
Symbol Cor
abca2 0.14
aco2 0.12
Since you want to do it row-wise we can use apply with MARGIN = 1
#Get column indices ending with G
g_cols <- grep("G$", names(df))
#Get column indices ending with P
p_cols <- grep("P$", names(df))
apply(df, 1, function(x) cor.test(as.numeric(x[g_cols]),
as.numeric(x[p_cols]), na.action = "na.exclude")$estimate)
# 1 2 3 4 5 6
# 0.21890 -0.52925 -0.52776 -0.82073 0.60473 -0.11785
A tidyverse approach would be
library(tidyverse)
df %>%
mutate(row = row_number()) %>%
select(-Symbol) %>%
gather(key, value, -row) %>%
group_by(row) %>%
summarise(ans = cor.test(value[key %in% g_cols], value[key %in% p_cols],
na.action = "na.exclude")$estimate)
# row ans
# <int> <dbl>
#1 1 0.219
#2 2 -0.529
#3 3 -0.528
#4 4 -0.821
#5 5 0.605
#6 6 -0.118

Resources