I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
# Walk over every column index; only odd indices start a pair (i, i + 1).
for (i in 1:ncol(df)) {
m = i
n = i + 1
if (i %% 2 != 0) {
# NOTE(review): every pass assigns to the same element df_cum$V1, so each
# pair's total replaces the one stored by the previous pass.
# NOTE(review): if ncol(df) is odd, n equals ncol(df) + 1 on the last pass and
# df[,n] would index past the last column -- confirm the column count is even.
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But, the way to add value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
It would be really appreciated if anyone knows how to do that in R.
You can try split.default(), i.e.
# Group the columns by their name with the digits removed (A1, A2 -> "A") and
# total each group; vapply() guarantees exactly one numeric value per group.
vapply(split.default(df, gsub('\\d+', '', names(df))), sum, numeric(1))
A B C
17 12 13
A base R option using tapply -
# Label the values column by column so that each consecutive pair of columns
# shares one group id, then total every group.
pair_id <- rep(seq_len(ncol(df)), each = nrow(df) * 2,
               length.out = nrow(df) * ncol(df))
tapply(unlist(df), pair_id, sum)
# 1 2 3
#17 12 13
The logic here is to create group of every 2 columns and sum them.
data
It is easier to help if you provide data in a reproducible format
df <- data.frame(A1 = c(0, 3, 2), A2 = c(2, 6, 4),
B1 = c(3, 0, 1), B2 = c(2, 3, 3),
C1 = c(7, 3, 2), C2 = c(1, 0, 0))
We can do this in tidyverse
library(dplyr)
library(tidyr)
# Split each column name into its letter prefix (one output column per letter
# via ".value") and its numeric suffix, drop the suffix, then total each column.
df1 %>%
  pivot_longer(everything(), names_to = c(".value", "grp"),
               names_sep = "(?<=[A-Z])(?=[0-9])") %>%
  select(-grp) %>%
  # Extra arguments are supplied inside the lambda rather than through
  # across(...); the data are ungrouped, so no .groups argument is needed.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
aggregate(values ~ ., transform(stack(df1),
ind = sub("\\d+", "", ind)), FUN = sum)
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
with(stack(df1), rowsum(values, group = trimws(ind, whitespace = "\\d+")))
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
{tmp <- colSums(df1); rowsum(tmp, group = substr(names(tmp), 1, 1))}
[,1]
A 17
B 12
C 13
data
df1 <- structure(list(A1 = c(0, 3, 2), A2 = c(2, 6, 4), B1 = c(3, 0,
1), B2 = c(2, 3, 3), C1 = c(7, 3, 2), C2 = c(1, 0, 0)),
class = "data.frame", row.names = c(NA,
-3L))
Related
I have two tables with a shared index, and I want to divide one by the other. This could be done by dividing the two data frames directly, but that seems arbitrary (how would I know I am dividing the right numbers?) and does not preserve the index, so I want to do this division by matching rows with the same index. What's the best way to do this? Is there a best practice for table division in this case?
tb1 <- data.frame(index = c(1, 2, 3), total_1 = c(100, 450, 300), total_2 = c(20, 39, 60))
tb2 <- data.frame(index = c(1, 2, 3), unit_1 = c(4, 2, 3), unit_2 = c(2, 3, 6))
tb1[,-1]/tb2[,-1]
total_1 total_2
1 25 10
2 225 13
3 100 10
In another case, two index columns must match.
tb2 <- data.frame(index_1 = c("a", "b", "b"), index_2 = c("c", "d", "b"), unit_1 = c(4, 2, 3), unit_2 = c(2, 3, 6))
tb1 <- data.frame(index_1 = c("a", "b", "b"), index_2 = c("c", "d", "b"), total_1 = c(100, 450, 300), total_2 = c(20, 39, 60))
If both datasets have the same index values and the same number of rows, one way is to order both by 'index' to ensure that their rows are in the same order, and then do the division.
# Sort both tables by 'index' so that row i of one lines up with row i of the
# other, then divide every non-index column of tb1 by its tb2 counterpart.
tb1new <- tb1[order(tb1$index), ]
tb2new <- tb2[order(tb2$index), ]
tb1new[-1] <- tb1new[-1] / tb2new[-1]
Or we can make a check on both 'index' first and use that condition to do the division
# all.equal() returns a character description (not FALSE) when its inputs
# differ, so wrap it in isTRUE() to get a single TRUE/FALSE usable in if().
i1 <- isTRUE(all.equal(tb1$index, tb2$index))
# Divide only when both tables list the same index values in the same order.
if (i1) tb1[-1] / tb2[-1]
Or another option in a join
library(data.table)
# Columns of tb1 to overwrite, and the tb2 columns to divide them by.
nm1 <- c('total_1', 'total_2')
nm2 <- c('unit_1', 'unit_2')
# Join tb2 onto tb1 by 'index' and replace each total column with total / unit.
# NOTE(review): setDT() is expected to convert tb1 to a data.table in place, so
# tb1 itself is modified -- confirm against the data.table documentation.
setDT(tb1)[tb2, (nm1) := .SD/mget(nm2), on = .(index), .SDcols = nm1]
You can perform a join and divide the columns. In base R :
result <- merge(tb1, tb2, by = c('index_1', 'index_2'))
result
# index_1 index_2 total_1 total_2 unit_1 unit_2
#1 a c 100 20 4 2
#2 b b 300 60 3 6
#3 b d 450 39 2 3
total_cols <- grep('total', names(result), value = TRUE)
unit_cols <- grep('unit', names(result), value = TRUE)
result[total_cols]/result[unit_cols]
# total_1 total_2
#1 25 10
#2 100 10
#3 225 13
Maybe this is not the most efficient solution but here is another way:
library(dplyr)
library(tidyr)
# For one index matching
# Join on 'index', divide each total by its unit, and keep only the index and
# the two ratios so that the result matches the output printed below.
tb1 %>%
  left_join(tb2, by = "index") %>%
  mutate(result_1 = total_1 / unit_1,
         result_2 = total_2 / unit_2) %>%
  select(!starts_with(c("total", "unit")))
index result_1 result_2
1 1 25 10
2 2 225 13
3 3 100 10
# For two indices matching
# Rows are paired only when both index_1 and index_2 agree; the total and unit
# columns are dropped once the ratios have been computed.
tb1 %>%
  left_join(tb2, by = c("index_1", "index_2")) %>%
  mutate(result_1 = total_1 / unit_1,
         result_2 = total_2 / unit_2) %>%
  select(!starts_with(c("total", "unit")))
index_1 index_2 result_1 result_2
1 a c 25 10
2 b d 225 13
3 b b 100 10
I want to use dplyr programming syntax (combine !! and :=) to evaluate a function in .fn argument but failed.
The code like this:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa = aa %>% group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = list(!!fun_name_1 := ~lag(., i), # ERROR OCCUR AT HERE
!!fun_name_2 := ~ rollmeanr(., i)),
.names = '{.col}_{.fn}'))
aa
}
I don't know how to solve it.
Any help will be highly appreciated!
======UPDATE========
My new code and new ERROR:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
# i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
aa
}
# Error: Problem with `mutate()` input `..1`.
# x 'names' attribute [6] must be the same length as the vector [5]
# i Input `..1` is `across(...)`.
# i The error occurred in group 1: region = 1.
# Run `rlang::last_error()` to see where the error occurred.
It would work as a named list. It makes perfect sense to pass a group by first (assuming that the OP's original example data have multiple rows per group)
i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
-output
# A tibble: 4 x 10
# Groups: region [4]
# region co_mean o3_mean pm2.5_mean co_mean_lag1 co_mean_lag01 o3_mean_lag1 o3_mean_lag01 pm2.5_mean_lag1 pm2.5_mean_lag01
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 5 5 5 NA 5 NA 5 NA 5
#2 2 5 5 5 NA 5 NA 5 NA 5
#3 3 5 5 5 NA 5 NA 5 NA 5
#4 4 5 5 5 NA 5 NA 5 NA 5
We could specify the fill argument in rollmeanr
# Pad the positions where the rolling window is incomplete with NA so that
# every result has the same length as its input column. fill is the padding
# value itself, not an on/off switch, so it must be NA rather than TRUE.
aa %>%
  group_by(region) %>%
  mutate(across(.cols = contains('mean'),
                .fns = setNames(list(~lag(., i),
                                     ~ rollmeanr(., i, fill = NA)),
                                c(fun_name_1, fun_name_2)),
                .names = '{.col}_{.fn}'))
First I don't think your data should be grouped, at least for the data shared it doesn't make sense to have only 1 row in the group and then calculate lag value and rolling mean on it.
You can have appropriate column names using .names in across and use map_dfc to combine everything into one dataframe.
library(dplyr)
library(purrr)
library(zoo)
# For each window size x, build lagged and right-aligned rolling-mean versions
# of every column whose name contains 'mean', tag the new names with x, and
# bind the results for all window sizes together column-wise.
map_dfc(1:3, function(x) {
  aa %>%
    transmute(across(.cols = contains('mean'),
                     .fns = list(lag = ~lag(., x),
                                 lag0 = ~rollmeanr(., x, fill = NA)),
                     # {.fn} and {.col} are the documented glue placeholders.
                     .names = sprintf('{.fn}_{.col}_%d', x)))
})
You can add group_by(region) if you are trying it on another dataset.
I wish to dynamically select variables and modify them in tibble dataframe. I have copied a sample problem. Can someone please help with that.
Thanks
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
for (var in variables)
{
df <- df %>% mutate(var = var + 1)
}
We can use mutate with across
library(dplyr)
df %>%
mutate(across(all_of(variables), ~ . + 1))
-output
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 2 5 6
#2 3 6 7
#3 4 7 8
data
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
Try this. You can use !! and sym() (tidy evaluation tools from rlang, which dplyr makes available) together with the := operator to get the evaluation you want. Here is the code:
library(dplyr)
#Data and code
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
for (var in variables)
{
var <- sym(var)
df <- df %>% mutate(!!var := !!var + 1)
}
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 2 5 6
2 3 6 7
3 4 7 8
The original dataset contains survey data in long form
Original dataset
T Q1 Q2 Q3
M1 3 5 4
M1 3 1 3
M1 1 3 1
M2 4 4 2
M2 2 2 3
M2 5 5 5
Where T is the type of respondents and Q1--Q3 are the questions, and the cell value corresponds to their agreement level on a 1--5 Likert
scale.
Wanted dataset
T Q A1 A2 A3 A4 A5
M1 Q1 1 0 2 0 0
M2 Q1 0 1 0 1 1
M1 Q2 1 0 1 0 1
M2 Q2 0 1 0 1 1
M1 Q3 1 0 1 1 0
M2 Q3 0 1 1 0 1
Where A1--A5 are the possible answers (1--5 Likert) and the cell value contains the frequency of these answers for each group M1 and M2.
How to get from the Original dataset to the Wanted dataset?
One way would be to use the dplyr and tidyr
library(dplyr)
library(tidyr)
df <- data.frame(Type = c('M1', 'M1', 'M1', 'M2', 'M2', 'M2'),
Q1 = c(3, 3, 1, 4, 2, 5),
Q2 = c(5, 1, 3, 4, 2, 5),
Q3 = c(4, 3, 1, 2, 3, 5))
# Reshape to one row per (Type, question, answer), count how often each answer
# occurs, then spread the counts into one column per answer level (A1..A5).
df %>%
  pivot_longer(-Type, names_to = "Q", values_to = "A") %>%
  # count() returns an ungrouped result, so no stray grouping is carried along.
  count(Type, Q, A, name = "Count") %>%
  mutate(A = paste0("A", A)) %>%
  # names_sort keeps the answer columns in A1..A5 order instead of the order
  # in which they first appear; absent answers get a count of 0.
  pivot_wider(names_from = A, values_from = Count,
              values_fill = 0L, names_sort = TRUE) %>%
  arrange(Q, Type)
I used tidyverse functions to solve your problem. Notice that I had to create row identifiers because gather and spread are not always symmetric (for more, check this out).
library(tidyverse)
# Data
x <- data.frame(
T = c("M1", "M1", "M1", "M2", "M2", "M2"),
Q1 = c(3, 3, 1, 4, 2, 5),
Q2 = c(5, 1, 3, 4, 2, 5),
Q3 = c(4, 3, 1, 2, 3, 5)
)
# Modification
# Stack the question columns into two columns: 'key' (question name) and
# 'A' (the answer given), keeping T as the identifier.
gather(x, key, A, -T) %>%
group_by(T, key, A) %>%
# Number the repeats of each (T, key, A) combination so that spread() below
# receives uniquely identified rows.
mutate(row_id = 1:n()) %>%
ungroup() %>%
# One column per answer value, named "A" followed by the value (sep = "");
# missing cells are filled with 0.
# NOTE(review): the value column is A itself, so the cells appear to hold the
# answer value rather than a frequency count -- confirm this is intended.
spread(A, A, fill = 0, sep = "") %>%
select(-row_id)
I am always unsure how to retrieve a summary with dplyr.
Let us suppose I have a summary of individuals and households.
# One row per person: household id, person id within the household, and age.
# The last row belongs to household 4 (person 3), matching the printed table.
dta <- as.data.frame(rbind(
  c(1, 1, 45),
  c(1, 2, 47),
  c(2, 1, 24),
  c(2, 2, 26),
  c(3, 1, 67),
  c(4, 1, 20),
  c(4, 2, 21),
  c(4, 3, 7)
))
colnames(dta) <- c('householdid', 'id', 'age')
householdid id age
1 1 45
1 2 47
2 1 24
2 2 26
3 1 67
4 1 20
4 2 21
4 3 7
Imagine I want to calculate the number of person in the household and the mean age by households and then re-use this information in the original dataset.
# Summarise each household (size and mean age), then merge the summary back
# onto the person-level rows by household id.
dta %>%
  group_by(householdid) %>%
  summarise(nhouse = n(), meanAgeHouse = mean(age)) %>%
  # TRUE is spelled out because T is an ordinary variable that can be reassigned.
  merge(., dta, by = "householdid", all = TRUE)
I am often using merge, but it is slow sometimes when the dataset is huge.
Is it possible to use mutate instead of merge?
# Add the household size and mean age to every person row without a join;
# ungroup() afterwards so later steps do not silently run per household.
dta %>%
  group_by(householdid) %>%
  mutate(nhouse = n(), meanAgeHouse = mean(age)) %>%
  ungroup()