I would like to divide the values in df1 by the corresponding values in df2. In this reproducible example I am able to sum the values; how can I do the division instead? Thanks in advance!
# Example data: two frames keyed by `country`; "c" occurs only in df1 and
# "d" only in df2.
df1 <- data.frame(country = c("a", "b", "c"), year1 = c(1, 2, 3), year2 = c(1, 2, 3))
df2 <- data.frame(country = c("a", "b", "d"), year1 = c(1, 2, NA), year2 = c(1, 2, 3))

# Stack both frames and add up the yearly values per country.
df3 <- bind_rows(df1, df2) %>%
  group_by(country) %>%
  # na.rm = TRUE already skips missing values, so no replace_na() step is needed
  summarise(across(where(is.numeric), ~ sum(.x, na.rm = TRUE))) %>%
  # A total of 0 is reported as NA. na_if() is applied column by column because
  # newer dplyr releases no longer accept a whole data frame as its input.
  mutate(across(where(is.numeric), ~ na_if(.x, 0)))
Expected result is:
# A tibble: 4 x 3
country year1 year2
<chr> <dbl> <dbl>
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
Some countries appear in both data frames (two rows after binding) and some in only one (a single row). Use an if/else condition within summarise/across: when a group has two rows, divide the first element by the last; otherwise return NA.
library(dplyr) # version 1.0.4
library(tidyr)
# Stack both frames so each country has up to two rows: df1's row first,
# df2's row second.
bind_rows(df1, df2) %>%
  group_by(country) %>%
  summarise(
    across(
      everything(),
      # Two rows: df1 value / df2 value. One row: nothing to divide by, so NA.
      # Missing values are deliberately NOT replaced by 0 beforehand; doing so
      # would turn a missing operand into 0, Inf or NaN instead of NA.
      ~ if (n() == 2) first(.x) / last(.x) else NA_real_
    )
  )
-output
# A tibble: 4 x 3
# country year1 year2
#* <chr> <dbl> <dbl>
#1 a 1 1
#2 b 1 1
#3 c NA NA
#4 d NA NA
Here is a base R option using merge + split.default
# Full outer join on country; the year columns of df1 and df2 come back with
# the suffixes .x and .y respectively.
df <- merge(df1, df2, by = "country", all = TRUE)
cbind(
  df["country"],
  list2DF(
    lapply(
      # Group the value columns by their name without the .x/.y suffix, so each
      # piece holds the df1 column followed by the df2 column of one year.
      split.default(df[-1], gsub("\\.(x|y)", "", names(df)[-1])),
      function(pair) pair[[1]] / pair[[2]]
    )
  )
)
which gives
country year1 year2
1 a 1 1
2 b 1 1
3 c NA NA
4 d NA NA
Related
Here is my dataframe:
# Example data: seven (VAR1, VAR2) pairs, each with a numeric VAR3.
DF <- data.frame(
  VAR1 = rep(c("A", "B", "C"), times = c(2, 3, 2)),
  VAR2 = c("B", "C", "A", "D", "C", "B", "D"),
  VAR3 = c(1, 1, 1, 2, 4, 6, 4)
)
I would like to have this:
VAR1 VAR2 VAR3
A B 2
A C 1
B D 2
B C 10
C D 4
If there are two rows like (VAR1=A, VAR2=B, VAR3=X) and (VAR1=B, VAR2=A, VAR3=Y), I want to have a single row (VAR1=A, VAR2=B, VAR3=X+Y). In other words, if the first two variables of two rows are "inverse" (swapped), I would like to have one row with the sum of their VAR3 values.
I tried to create a column which says "Yes" if two rows have inverse values, but I can't find a way to do it.
My code:
# Attempt to flag rows whose (VAR1, VAR2) pair also occurs in reverse order.
# NOTE: filter(VAR1 == VAR2) keeps only rows whose two columns are equal. The
# sample DF has no such row, so pull(VAR2) is empty, the second %in% test is
# FALSE for every row, and case_when() falls through to 'No' everywhere.
DF <- DF %>%
mutate(VAR4 = case_when(VAR2 %in% DF$VAR1 &
VAR1 %in%
(DF %>%
filter(VAR1 == VAR2) %>%
pull(VAR2)
) ~ "Yes",
TRUE ~ 'No' ))
`
This is the result:
VAR1 VAR2 VAR3 VAR4
A B 1 No
A C 1 No
B A 1 No
B D 2 No
B C 4 No
C B 6 No
C D 4 No
My code doesn't work because my filter doesn't take the result of VAR2 %in% DF$VAR1 into account.
Does someone have an idea?
You can sort first with apply, and then summarise:
# Sort the two key columns within each row so that a pair and its reverse
# (e.g. B/A and A/B) end up with the same key. This overwrites DF in place.
DF[c("VAR1", "VAR2")] <- t(apply(DF[c("VAR1", "VAR2")], 1, sort))
# Collapse rows that now share a key, adding up their VAR3 values.
summarise(group_by(DF, VAR1, VAR2), VAR3 = sum(VAR3))
# A tibble: 5 × 3
# Groups: VAR1 [3]
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4
Or, in single pipe:
# Single-pipe variant: for every row, sort the first two columns (..1 and ..2)
# and store the result in a list-column VAR as a vector named VAR1/VAR2; then
# group on VAR, sum VAR3, and spread VAR back into the columns VAR1 and VAR2.
# NOTE(review): pmap()/set_names() and unnest_wider() are presumably purrr and
# tidyr functions; no library() call is shown with this snippet - confirm they
# are attached.
DF %>%
mutate(VAR = pmap(., ~ sort(c(..1, ..2)) %>%
set_names(., c("VAR1", "VAR2")))) %>%
group_by(VAR) %>%
summarise(VAR3 = sum(VAR3)) %>%
unnest_wider(VAR)
You could try:
library(dplyr)

DF %>%
  # pmin()/pmax() compare strings, so make sure both key columns are character
  mutate(across(VAR1:VAR2, as.character)) %>%
  # idx1 is the alphabetically smaller and idx2 the larger label of each row,
  # which gives a pair and its reverse the same key
  mutate(idx1 = pmin(VAR1, VAR2), idx2 = pmax(VAR1, VAR2)) %>%
  group_by(idx1, idx2) %>%
  summarise(VAR3 = sum(VAR3)) %>%
  ungroup() %>%
  rename(VAR1 = idx1, VAR2 = idx2)
Output:
# A tibble: 5 x 3
VAR1 VAR2 VAR3
<chr> <chr> <dbl>
1 A B 2
2 A C 1
3 B C 10
4 B D 2
5 C D 4
I have the following dataframe:
# Example data: five labelled rows; only the first two have a var2 value.
var1 <- letters[1:5]
var2 <- c(5, 10, rep(NA, 3))
df <- data.frame(var1 = var1, var2 = var2)
df
# A tibble: 5 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 c NA
4 d NA
5 e NA
I would like to count and merge the NA rows. Expected output:
# A tibble: 3 × 2
var1 var2
<chr> <dbl>
1 a 5
2 b 10
3 x 3
I have tried aggregate(data=df,var2~.,na.rm = FALSE, FUN = sum) but it only returns the results for a and b.
Thank you in advance
With dplyr
# Label every row with a missing var2 as "x", overwrite var2 in those rows with
# the number of such rows, then drop the resulting duplicate rows.
unique(
  mutate(
    df,
    var1 = ifelse(is.na(var2), "x", var1),
    # var1 has already been relabelled at this point, so var1 == "x" marks the
    # rows to fill; the sum counts the missing values among them
    var2 = ifelse(var1 == "x", sum(is.na(var2) & var1 == "x"), var2)
  )
)
var1 var2
1 a 1
2 b 2
3 c 3
4 x 2
Data
# Data used for the output shown with this answer: three rows with a value
# followed by two rows with a missing var2.
df <- structure(
  list(
    var1 = c("a", "b", "c", "d", "e"),
    var2 = c(1, 2, 3, rep(NA_real_, 2))
  ),
  class = "data.frame",
  row.names = c(NA, -5L)
)
Try this with Base R
# Keep the rows that have no missing value.
s <- df[complete.cases(df), ]
# Append one summary row labelled "x" holding the number of missing var2 values.
# The row is added as a one-row data frame: assigning c("x", <count>) instead
# would coerce the count to a string and turn the whole var2 column into
# character.
s <- rbind(s, data.frame(var1 = "x", var2 = sum(is.na(df$var2))))
s
output
var1 var2
1 a 5
2 b 10
3 x 3
Using aggregate:
# Relabel every row whose var2 is missing as "x" (this modifies df in place).
df[["var1"]] <- ifelse(is.na(df[["var2"]]), "x", df[["var1"]])
# Aggregate var2 over the remaining column(s). na.action = NULL stops
# aggregate() from discarding the rows with a missing var2. A group made up
# only of NAs is reported by its size, any other group by its sum.
aggregate(
  var2 ~ .,
  data = df,
  FUN = function(values) {
    if (all(is.na(values))) length(values) else sum(values)
  },
  na.action = NULL
)
#> var1 var2
#> 1 a 5
#> 2 b 10
#> 3 x 3
Another approach with dplyr
df %>%
  mutate(
    # helper flag for the rows without a value; count() below drops it again
    is_missing = is.na(var2),
    var1 = ifelse(is_missing, "x", var1),
    # each missing row contributes 1, so the "x" total is the number of NA rows
    var2 = ifelse(is_missing, 1, var2)
  ) %>%
  # weighted count = sum of var2 per label, returned in a column named var2
  count(var1, wt = var2, name = "var2")
Is there a way to melt 2 columns and take their sums as the values? For example:
# Example data: one label column (A) and three numeric columns; Cat1 and New2
# are the two columns to be summed.
df <- data.frame(
  A = c("x", "y", "z"),
  B = as.numeric(1:3),
  Cat1 = c(1, 4, 3),
  New2 = rep(4, 3)
)
Expected output
New_Col Sum
Cat1 8
New2 12
Or using base R with colSums after selecting the columns of interest and then convert the named vector to data.frame with stack
# Column totals as a named vector -> two-column data frame (values, ind);
# the columns are then picked by name so that the label column comes first.
stack(colSums(df[, c("Cat1", "New2")]))[c("ind", "values")]
ind values
1 Cat1 8
2 New2 12
Of course
# Sum the two columns of interest, then reshape the one-row result to long
# form. The columns are named explicitly: starts_with("Cat") would match only
# Cat1 and silently leave New2 out of the result.
df %>%
  summarise(across(c(Cat1, New2), sum)) %>%
  pivot_longer(everything(), names_to = "New_Col", values_to = "Sum")
# A tibble: 2 × 2
#   New_Col   Sum
#   <chr>   <dbl>
# 1 Cat1        8
# 2 New2       12
I have the following 2 data frames in R:
df1 = [a,2 df2 = [a,10
b,3] c,2]
I want to add those 2 df, so the output can be
df = [a, 12,
b, 3,
c, 2]
Any advice would be much appreciated, thanks!
We can rbind the two datasets and do a group by sum
# Stack the two data frames, then total col2 within each col1 value.
aggregate(
  col2 ~ col1,
  data = rbind(df1, df2),
  FUN = sum
)
-output
# col1 col2
#1 a 12
#2 b 3
#3 c 2
Or in dplyr
library(dplyr)

# Stack both frames, group by the key column and total col2 per key;
# .groups = 'drop' returns an ungrouped result.
summarise(
  group_by(bind_rows(df1, df2), col1),
  col2 = sum(col2),
  .groups = 'drop'
)
-output
# A tibble: 3 x 2
# col1 col2
# <chr> <dbl>
#1 a 12
#2 b 3
#3 c 2
data
# Example data: key "a" occurs in both frames, "b" only in df1, "c" only in df2.
df1 <- data.frame(
  col1 = c("a", "b"),
  col2 = c(2, 3)
)
df2 <- data.frame(
  col1 = c("a", "c"),
  col2 = c(10, 2)
)
I am processing a large dataset adapted to my research. Suppose that I have 4 observations (records) and 5 columns as follows:
# Example data: four records, each with up to two group labels (group1, group2)
# and the matching hours (hours1, hours2).
x <- data.frame(
  ID = as.numeric(1:4),
  group1 = c("A", NA, "B", NA),
  group2 = c("B", "A", NA, "C"),
  hours1 = c(3, NA, 5, NA),
  hours2 = c(1, 2, NA, 5)
)
> x
ID group1 group2 hours1 hours2
1 A B 3 1
2 <NA> A NA 2
3 B <NA> 5 NA
4 <NA> C NA 5
The "group1" and "group2" are reference columns containing the character values of A, B, and C, and the last two columns, "hours1" and "hours2," are numeric indicating hours obviously.
The column "group1" corresponds to the column "hours1"; likewise, "group2" corresponds to "hours2".
I want to create multiple columns according to the values, A, B, and C, of the reference columns matching to values of "hours1" and "hours2" as follows:
ID group1 group2 hours1 hours2 A B C
1 A B 3 1 3 1 NA
2 <NA> A NA 2 2 NA NA
3 B <NA> 5 NA NA 5 NA
4 <NA> C NA 5 NA NA 5
For example, ID 1 has A in "group1", corresponding to 3 in "hours1", so 3 appears under the column "A". ID 3 has B in "group1", corresponding to 5 in "hours1", so 5 appears under the column "B". In "group2", ID 4 has C, corresponding to 5 in "hours2", so 5 appears under the column "C".
Is there a way to do it using R?
One way would be to combine all the "hours" columns into one column and all the "group" columns into another. This can be done using pivot_longer. After that, we can reshape the data to wide format and join it with the original data.
library(dplyr)
library(tidyr)

x %>%
  # Strip the trailing digits from the column names so that group1/group2 are
  # stacked into one `group` column and hours1/hours2 into one `hours` column;
  # values_drop_na = TRUE leaves out the missing entries.
  pivot_longer(
    cols = -ID,
    names_to = ".value",
    names_pattern = "(.*?)\\d+",
    values_drop_na = TRUE
  ) %>%
  # one column per group label, filled with that label's hours
  pivot_wider(
    names_from = group,
    values_from = hours
  ) %>%
  # bring back the original columns and put them ahead of the new ones
  left_join(x, by = "ID") %>%
  select(ID, starts_with("group"), starts_with("hour"), everything())
# A tibble: 4 x 8
# ID group1 group2 hours1 hours2 A B C
# <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 A B 3 1 3 1 NA
#2 2 NA A NA 2 2 NA NA
#3 3 B NA 5 NA NA 5 NA
#4 4 NA C NA 5 NA NA 5
For OP's dataset we can slightly modify the code to achieve the desired result.
# Same reshape applied to the asker's own data set `zz`, which is not shown in
# this text.
# NOTE(review): presumably zz has an `id` column plus columns named like
# fu2a_<suffix> (labels) and fu2b_<suffix> (values), since the pattern '(.*)_'
# keeps the part of each name before the last underscore - confirm against zz.
# arrange(fu2a) sorts the long data by label before widening, which presumably
# fixes the order of the new columns - confirm.
zz %>%
pivot_longer(cols = -id,
names_to = c('.value'),
names_pattern = '(.*)_',
values_drop_na = TRUE) %>%
arrange(fu2a) %>%
pivot_wider(names_from = fu2a, values_from = fu2b) %>%
left_join(zz, by = 'id') %>%
select(id, starts_with('fu2a'), starts_with('fu2b'), everything())
Another approach using dplyr could be done separating group and hours variables to compute the desired variables and then merge with the original x:
library(tidyverse)

# Data
x <- data.frame(
  ID = c(1, 2, 3, 4),
  group1 = c("A", NA, "B", NA),
  group2 = c("B", "A", NA, "C"),
  hours1 = c(3, NA, 5, NA),
  hours2 = c(1, 2, NA, 5),
  stringsAsFactors = FALSE
)

# Reshape
# Long table of the group labels (columns 1-3: ID, group1, group2). `id`
# numbers the rows within each ID (1 for group1, 2 for group2) so that each
# label can be paired with its hours below.
groups_long <- x %>%
  select(1:3) %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  mutate(id = seq_len(n())) %>%
  ungroup()

# Long table of the hours (columns 1, 4, 5: ID, hours1, hours2), numbered the
# same way; name/value are renamed so they do not clash with the label table.
hours_long <- x %>%
  select(c(1, 4:5)) %>%
  pivot_longer(cols = -ID) %>%
  rename(name2 = name, value2 = value) %>%
  group_by(ID) %>%
  mutate(id = seq_len(n())) %>%
  ungroup()

# Pair each label with its hours, drop the rows without a label, and spread
# the labels into one column each.
hours_by_group <- groups_long %>%
  left_join(hours_long, by = c("ID", "id")) %>%
  filter(!is.na(value)) %>%
  select(ID, value, value2) %>%
  pivot_wider(names_from = value, values_from = value2)

# Attach the new per-label columns to the original data.
x %>%
  left_join(hours_by_group, by = "ID")
Output:
ID group1 group2 hours1 hours2 A B C
1 1 A B 3 1 3 1 NA
2 2 <NA> A NA 2 2 NA NA
3 3 B <NA> 5 NA NA 5 NA
4 4 <NA> C NA 5 NA NA 5