The data below have an IndID field as well as three columns containing numbers, including NA in some instances, with a varying number of rows for each IndID.
library(dplyr)
# Build a reproducible example: 10 rows drawn from four IDs, one complete
# numeric column (Num1) and two that may contain NA, sorted by IndID.
n <- 10
set.seed(123)
dat <- data.frame(
  IndID = sample(c("AAA", "BBB", "CCC", "DDD"), n, replace = TRUE),
  Num1 = c(2, 4, 2, 4, 4, 1, 3, 4, 3, 2),
  Num2 = sample(c(1, 2, 5, 8, 7, 8, NA), n, replace = TRUE),
  Num3 = sample(c(NA, NA, NA, 8, 7, 9, NA), n, replace = TRUE)
) %>%
  arrange(IndID)
head(dat)
IndID Num1 Num2 Num3
1 AAA 1 NA 7
2 BBB 2 NA NA
3 BBB 2 7 7
4 BBB 2 NA NA
5 CCC 3 2 8
6 CCC 3 5 NA
For each IndID, I would like to make a new column Max that contains the maximum value for Num1:Num3. In most instances this involves finding the max value across multiple rows and columns. Within dplyr I am missing the final step (below) and would appreciate any suggestions.
# Incomplete attempt from the question: "???" is a placeholder for the
# per-IndID maximum over Num1:Num3 that is being asked for.
dat %>%
group_by(IndID) %>%
mutate(Max = "???")
One option is pmax, which returns the row-wise maxima
# Row-wise maximum over the three numeric columns, ignoring NA values.
mutate(dat, Max = pmax(Num1, Num2, Num3, na.rm = TRUE))
If there are many columns, we can take the column names, convert them to symbols, and splice them into the call with !!!
# Same row-wise maximum, but the columns are taken programmatically:
# every column except the first (IndID) is turned into a symbol and
# spliced into pmax().
mutate(dat, Max = pmax(!!!rlang::syms(names(dat)[-1]), na.rm = TRUE))
# A tibble: 10 x 5
# Groups: IndID [4]
# IndID Num1 Num2 Num3 Max
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 AAA 1 NA 7 7
# 2 BBB 2 NA NA 2
# 3 BBB 2 7 7 7
# 4 BBB 2 NA NA 2
# 5 CCC 3 2 8 8
# 6 CCC 3 5 NA 5
# 7 DDD 4 8 7 8
# 8 DDD 4 7 NA 7
# 9 DDD 4 1 7 7
#10 DDD 4 1 7 7
If this is to get the max of all 'Num' column grouped by 'IndID', there are multiple ways.
1) From the above step, we can extend it to group by 'IndID' and then take the max of row maxs ('Max')
# Per-IndID maximum: take the row-wise maximum of all non-ID columns and
# then the maximum of those values within each IndID group.
dat %>%
  group_by(IndID) %>%
  mutate(Max = max(pmax(!!!rlang::syms(names(.)[-1]), na.rm = TRUE)))
2) Another option is to convert the 'wide' format to 'long' with gather, then grouped by 'IndID', get the max of 'val' column and right_join with the original dataset
library(tidyverse)
# Reshape the Num* columns to long form, take the maximum per IndID, and
# join that single value back onto every original row.
# pivot_longer() replaces the superseded gather(); the join key is spelled
# out so the join does not depend on whichever columns happen to match.
dat %>%
  pivot_longer(cols = -IndID, names_to = "key", values_to = "val") %>%
  group_by(IndID) %>%
  summarise(Max = max(val, na.rm = TRUE)) %>%
  right_join(dat, by = "IndID")
3) Or another option without reshaping into 'long' format would be to nest the dataset after grouping by 'IndID', unlist and get the max of the 'Num' columns
# Nest the non-ID columns per IndID, compute the maximum over every value
# in each nested data frame, and unnest back to the original shape.
# unnest() is given `cols` explicitly; calling it without is deprecated.
dat %>%
  group_by(IndID) %>%
  nest() %>%
  mutate(data = map(data, function(grp) {
    mutate(grp, Max = max(unlist(grp), na.rm = TRUE))
  })) %>%
  unnest(cols = data)
Related
I have a very simple case where I want to combine several data frames into one, keeping only the ids that appear in one particular data frame.
Example:
# Three small example frames sharing the columns `id` and `x`.
# Built directly so that no free-standing `id`/`x` vectors are left in the
# workspace, where they could be confused with the data frame columns.
data1 <- data.frame(id = c(1, 2, 3), x = c(10, 12, 14))
data2 <- data.frame(id = c(2, 3), x = c(20, 22))
data3 <- data.frame(id = c(1, 3), x = c(30, 32))
Which gives us,
$data1
id x
1 1 10
2 2 12
3 3 14
$data2
id x
1 2 20
2 3 22
$data3
id x
1 1 30
2 3 32
Now, I want to combine all three data frames based on the id's of the data3. The expected output should look like
> comb
id x
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying the following, but not getting the expected output.
library(dplyr)
library(tidyr)
# Attempt from the question. NOTE(review): `.id = "id"` asks bind_rows() for an
# identifier column with the same name as the existing `id` column, which is
# presumably why the result is not the expected one; confirm.
combined <- bind_rows(data1, data2, data3, .id = "id") %>% arrange(id)
Any idea how to get the expected output?
Does this work:
library(dplyr)
library(tidyr)
# Put the three `x` columns side by side per id, keep only ids present in
# data3, then stack the values back into one column.
# semi_join() filters without adding columns, so no distinct() is needed;
# the earlier right_join() + distinct() also removed legitimate rows when
# two frames held the same value for an id.
data1 %>%
  full_join(data2, by = "id") %>%
  full_join(data3, by = "id") %>%
  semi_join(data3, by = "id") %>%
  arrange(id) %>%
  pivot_longer(cols = -id) %>%
  select(-name)
# A tibble: 6 x 2
id value
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
Stack the three data frames into one with bind_rows(), then use filter() to keep only the ids that appear in the third data frame.
library(dplyr)
library(tidyr)
# Stack the frames with a source index (new_id), keep only ids that occur
# in the third frame, and fill in missing new_id/id combinations with NA.
list(data1, data2, data3) %>%
  bind_rows(.id = "new_id") %>%
  filter(id %in% id[new_id == "3"]) %>%
  complete(new_id, id)
# new_id id x
# <chr> <dbl> <dbl>
#1 1 1 10
#2 1 3 14
#3 2 1 NA
#4 2 3 22
#5 3 1 30
#6 3 3 32
A pure base R solution can also do it
# Stack the three frames with a group index, spread to wide form so each id
# has one column per frame, keep only ids present in the third frame, then
# reshape back to long form and return the id/x columns.
lst <- list(data1, data2, data3)
reshape(
subset(
reshape(
# tag each frame with its position in `lst` (grp) and row-bind them
do.call(rbind, Map(cbind, lst, grp = seq_along(lst))),
idvar = "id",
timevar = "grp",
direction = "wide"
),
# retain only the ids that occur in the third data frame
id %in% lst[[3]]$id
),
idvar = "id",
# every column except the first (id) is a repeated measurement
varying = -1,
direction = "long"
)[c("id", "x")]
which gives
id x
1.1 1 10
3.1 3 14
1.2 1 NA
3.2 3 22
1.3 1 30
3.3 3 32
>
Using base R
# For every object named data<digits> in the workspace: keep the rows whose
# id occurs in data3, append an NA row for each data3 id that is absent,
# and row-bind the results.
do.call(rbind, unname(lapply(mget(ls(pattern = "^data\\d+$")), function(d) {
  kept <- subset(d, id %in% data3$id)
  absent <- setdiff(data3$id, kept$id)
  if (length(absent) == 0) {
    return(kept)
  }
  rbind(kept, cbind(id = absent, x = NA))
})))
-output
id x
1 1 10
3 3 14
2 3 22
11 1 NA
12 1 30
21 3 32
# Stack the frames with a source index, expand to every id/source
# combination (missing ones become NA), keep the ids found in data3, and
# return only id and x.
bind_rows(data1, data2, data3, .id = "grp") %>%
  complete(id, grp) %>%
  filter(id %in% data3$id) %>%
  select(id, x)
# A tibble: 6 x 2
id x
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
In the example below how can I calculate the row mean when column A is NA? The row mean would replace the NA in column A. Using base R, I can use this:
# Example data: A has gaps to be filled from the mean of the other columns.
foo <- tibble(
  A = c(3, 5, NA, 6, NA, 7, NA),
  B = c(4, 5, 4, 5, 6, 4, NA),
  C = c(6, 5, 2, 8, 8, 5, NA)
)
foo
# Mean of every column except the first, ignoring NA (NaN if all are NA).
row_avg <- rowMeans(foo[, -1], na.rm = TRUE)
missing_a <- is.na(foo$A)
foo$A[missing_a] <- row_avg[missing_a]
# Rows with no data at all produced NaN; turn those back into NA.
foo$A[is.nan(foo$A)] <- NA
Curious how I might do this with dplyr?
You can use ifelse :
library(dplyr)
# Replace NA in A with the row mean (computed with NA removed); rows where
# every value is missing yield NaN, which is converted back to NA.
foo %>%
  mutate(
    A = ifelse(is.na(A), rowMeans(., na.rm = TRUE), A),
    A = replace(A, which(is.nan(A)), NA)
  )
# A B C
# <dbl> <dbl> <dbl>
#1 3 4 6
#2 5 5 5
#3 3 4 2
#4 6 5 8
#5 7 6 8
#6 7 4 5
#7 NA NA NA
Here is a solution that replaces NA not only in column A but in every column of the data frame.
library(dplyr)
# Fill NA in every column with the row mean; rows whose mean is NaN (all
# values missing) are left untouched. The helper column is dropped again.
foo2 <- foo %>%
  mutate(RowMean = rowMeans(., na.rm = TRUE)) %>%
  mutate(
    across(-RowMean, ~ ifelse(is.na(.x) & !is.nan(RowMean), RowMean, .x))
  ) %>%
  select(-RowMean)
Use if_else:
# Compute the row mean into a helper column `m`, use it wherever A is NA
# and the mean itself is defined, then drop the helper.
# across() is given its columns explicitly (the no-argument form is
# deprecated) and TRUE is spelled out instead of the reassignable `T`.
foo %>%
  mutate(
    m = rowMeans(across(everything()), na.rm = TRUE),
    A = if_else(is.na(A) & !is.na(m), m, A)
  ) %>%
  select(-m)
# # A tibble: 7 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 3 4 6
# 2 5 5 5
# 3 3 4 2
# 4 6 5 8
# 5 7 6 8
# 6 7 4 5
# 7 NA NA NA
I have a number of large data frames which have the occasional string value, and I would like to know what the unique string values are (ignoring the numeric values) and, if possible, count these strings.
# Example frame: an index column (auto-named X1.16), two columns that mix
# numbers with an occasional string (so they are stored as character), and
# one purely numeric column.
df <- data.frame(1:16)
df[["A"]] <- c(
  "Name", 0, 0, 0, 0, 0, 12, 12, 0, 14, NA_real_, 14, NA_real_, NA_real_,
  16, 16
)
df[["B"]] <- c(
  10, 0, "test", 0, 12, 12, 12, 12, 0, 14, NA_real_, 14, 16, 16, 16, 16
)
df[["C"]] <- rep(c(10, 12, 14, 16), times = 4)
X1.16 A B C
1 1 Name 10 10
2 2 0 0 12
3 3 0 test 14
4 4 0 0 16
5 5 0 12 10
6 6 0 12 12
7 7 12 12 14
8 8 12 12 16
9 9 0 0 10
10 10 14 14 12
11 11 <NA> <NA> 14
12 12 14 14 16
13 13 <NA> 16 10
14 14 <NA> 16 12
15 15 16 16 14
16 16 16 16 16
I know I can use the count function in dplyr but I have too many unique numeric values so this is not a great solution. In the code below I was able to filter my data so to only retain rows that contain an alphabetical character (although this isn't a solution either).
# Keep the rows in which at least one column contains a letter.
# if_any() replaces the superseded filter_all(any_vars()) form.
df %>%
  filter(if_any(everything(), ~ str_detect(.x, pattern = "[:alpha:]")))
X1.16 A B C
1 1 Name 10 10
2 3 0 test 14
My desired output would be something to the effect of:
Variable n
"Name" 1
"test" 1
You can get the string value with grep and count them using table :
# Pull every cell that contains a letter, tabulate the matches, and return
# a two-column data frame with the string first and its count second.
stack(
  table(grep("[[:alpha:]]", unlist(df), value = TRUE))
)[, c("ind", "values")]
If you want a tidyverse answer you can get the data in long format, keep only the rows with characters in it and count them.
library(dplyr)
# Convert every column to character so they can share one long column,
# keep the cells that contain a letter, and count each distinct string.
# across() names its columns explicitly; omitting `.cols` is deprecated.
df %>%
  mutate(across(everything(), as.character)) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  filter(grepl("[[:alpha:]]", value)) %>%
  count(value)
# value n
# <chr> <int>
#1 Name 1
#2 test 1
@Ronak and @akrun above beat me to the punch; my solution is very similar, with an extension if you want counts within columns
# Coerce to tibble for ease of reading; all columns become character so
# they can be stacked into a single column.
df <- df %>%
  as_tibble() %>%
  mutate(across(everything(), as.character))
# Stack all cells, keep those containing a letter, and count each string.
# filter() + count() replaces a summarise() that returned several rows per
# group, which dplyr has deprecated.
df %>%
  pivot_longer(cols = everything()) %>%
  filter(str_detect(value, "[:alpha:]")) %>%
  count(Variable = value, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 1
2 test 1
# str_subset is a convenient wrapper around filter & str_detect
Add some extra words to test
# Test on extra word counts - replace 12 and 14 with words
df2 <- df
df2[df2 == 12] <- "Name"
df2[df2 == 14] <- "test"
df2
# Stack all cells, keep those containing a letter, and count each string.
# filter() + count() replaces a summarise() that returned several rows,
# which dplyr has deprecated.
df2 %>%
  pivot_longer(cols = everything()) %>%
  filter(str_detect(value, "[:alpha:]")) %>%
  count(Variable = value, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 12
2 test 10
If you want counts by column
# Count each string separately per source column. The first column (the
# index) is dropped, cells are stacked with their column name, and only
# cells containing a letter are counted.
# filter() + count() replaces a multi-row summarise(), which dplyr has
# deprecated; the result stays grouped by `col` as before.
df2 %>%
  select(-1) %>%
  pivot_longer(everything(), names_to = "col") %>%
  filter(str_detect(value, "[:alpha:]")) %>%
  group_by(col) %>%
  count(Variable = value)
# A tibble: 6 x 3
# Groups: col [3]
col Variable n
<chr> <chr> <int>
1 A Name 3
2 A test 2
3 B Name 4
4 B test 3
5 C Name 4
6 C test 4
We can use filter with across
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
# Drop the index column, make everything character, keep rows where any
# column contains a letter, then stack the cells and count each string.
# if_any() replaces across() inside filter(), which dplyr has deprecated.
df %>%
  select(-1) %>%
  mutate(across(everything(), as.character)) %>%
  filter(if_any(everything(), ~ str_detect(.x, "[:alpha:]"))) %>%
  pivot_longer(everything()) %>%
  filter(str_detect(value, "[:alpha:]")) %>%
  count(value)
# A tibble: 2 x 2
# value n
# <chr> <int>
#1 Name 1
#2 test 1
Is there a way to transpose and sum distinct values in R? For example:
df
Cola Order Quantity Loc
ABC 1 4 LocA
ABC 1 4 LocB
CSD 4 6 LocA
CDS 3 2 LocB
We have same values for Order and Quantity but still need to take sum of it.
Expected Output (Transpose with respect to Quantity)
Cola Order Quantity LocA_Quantity Loc B_Quantity
ABC 2 8 4 4
CSD 4 6 6
CDS 3 2 2
Create the dataset:
library(tibble)
# Example orders: one row per Cola/Loc combination.
df <- tibble(
  Cola = c("ABC", "ABC", "CSD", "CDS"),
  Order = c(1, 1, 4, 3),
  Quantity = c(4, 4, 6, 2),
  Loc = c("LocA", "LocB", "LocA", "LocB")
)
Create the summaries:
library(dplyr)
# Per Cola: total orders, quantity at each location, and total quantity.
# Quantity is overwritten last so the per-location sums still see the
# original row values.
df %>%
  group_by(Cola) %>%
  summarise(
    Order = sum(Order),
    LocA_Quantity = sum(Quantity * as.numeric(Loc == "LocA")),
    LocB_Quantity = sum(Quantity * as.numeric(Loc == "LocB")),
    Quantity = sum(Quantity)
  )
You can do it for both Quantity and Order and drop the columns you don't want at the end, i.e.
library(tidyverse)
# Add per-Cola totals as Order_new / Quantity_new, then spread the
# original Order and Quantity values into one column per location.
# across() replaces the superseded mutate_at(), and the columns are named
# rather than picked by position so a column reorder cannot change the
# result.
df %>%
  group_by(Cola) %>%
  mutate(across(c(Order, Quantity), sum, .names = "{.col}_new")) %>%
  pivot_wider(names_from = Loc, values_from = c(Order, Quantity))
## A tibble: 3 x 7
## Groups: Cola [3]
# Cola Order_new Quantity_new Order_LocA Order_LocB Quantity_LocA Quantity_LocB
# <fct> <int> <int> <int> <int> <int> <int>
#1 ABC 2 8 1 1 4 4
#2 CSD 4 6 4 NA 6 NA
#3 CDS 3 2 NA 3 NA 2
1) dplyr/tidyr Using the data shown reproducibly in the Note at the end, sum the orders and quantity and create a Quantity_ column equal to Quantity by Cola. Then reshape the Quantity_ column to wide form.
library(dplyr)
library(tidyr)
# Keep a copy of the row-level quantity (Quantity_), replace Order and
# Quantity by their per-Cola totals, then spread the copy into one
# Quantity_<Loc> column per location, filling absent locations with 0.
df %>%
  group_by(Cola) %>%
  mutate(
    Quantity_ = Quantity,
    Order = sum(Order),
    Quantity = sum(Quantity)
  ) %>%
  ungroup() %>%
  pivot_wider(
    names_from = Loc,
    values_from = Quantity_,
    names_prefix = "Quantity_",
    values_fill = list(Quantity_ = 0)
  )
giving:
# A tibble: 3 x 5
Cola Order Quantity Quantity_LocA Quantity_LocB
<chr> <int> <int> <int> <int>
1 ABC 2 8 4 4
2 CSD 4 6 6 0
3 CDS 3 2 0 2
2) Base R We can do much the same in base R using transform/ave and reshape like this:
# Keep the row-level quantity as Quantity_, replace Quantity and Order by
# their per-Cola totals, then reshape Quantity_ to one column per Loc.
df2 <- transform(df,
  Quantity_ = Quantity,
  Quantity = ave(Quantity, Cola, FUN = sum),
  Order = ave(Order, Cola, FUN = sum)
)
# `direction` is spelled out instead of relying on partial argument
# matching of `dir`.
wide <- reshape(df2,
  direction = "wide", idvar = c("Cola", "Quantity", "Order"),
  timevar = "Loc", sep = ""
)
wide
## Cola Order Quantity Quantity_LocA Quantity_LocB
## 1 ABC 2 8 4 4
## 3 CSD 4 6 6 NA
## 4 CDS 3 2 NA 2
Note
# Reproducible input: the table as text, one record per line, parsed with
# strings kept as character.
Lines <- paste(
  "Cola Order Quantity Loc",
  "ABC 1 4 LocA",
  "ABC 1 4 LocB",
  "CSD 4 6 LocA",
  "CDS 3 2 LocB",
  sep = "\n"
)
df <- read.table(text = Lines, header = TRUE, as.is = TRUE)
I want to merge two tables and keep only the Tasks that appear in both; Tasks that are not common are removed. Where a Task appears in both tables, I want to keep only the smaller value of each column.
Two tables like this
# First table: five tasks with partly missing FC/FH values.
x <- data.frame(
  Task = c("A", "B", "C", "D", "E"),
  FC = c(12, NA, 15, 14, NA),
  FH = c(13, 15, NA, 17, 20)
)
Task FC FH
1 A 12 13
2 B NA 15
3 C 15 NA
4 D 14 17
5 E NA 20
# Second table: four tasks, two of which (B, C) also occur in the first.
y <- data.frame(
  Task = c("B", "C", "F", "G"),
  FC = c(NA, 12, 20, NA),
  FH = c(NA, 17, 18, NA)
)
Task FC FH
1 B NA NA
2 C 12 17
3 F 20 18
4 G NA NA
I want an output like this
Task FC FH
2 B NA 15
3 C 12 17
One dplyr possibility could be:
# Stack both tables, keep tasks that occur more than once, and take the
# per-task minimum of each column (NA when every value is missing).
# across() replaces the superseded summarise_all(), and a plain if/else
# replaces ifelse() on a scalar condition.
bind_rows(x, y) %>%
  group_by(Task) %>%
  filter(n() > 1) %>%
  summarise(across(
    everything(),
    ~ if (all(is.na(.x))) NA_real_ else min(.x, na.rm = TRUE)
  ))
Task FC FH
<chr> <dbl> <dbl>
1 B NA 15
2 C 12 17
Or if there could be duplicate tasks per single df:
# Variant that tolerates duplicate tasks within one table: rows are tagged
# with their source table (ID) and a task is kept only if it occurs in more
# than one source. The minimum is then taken per column, NA when every
# value is missing.
# n_distinct(ID) > 1 already implies more than one row, so the separate
# n() > 1 test is dropped; ID is excluded from the summary instead of
# being summarised and removed afterwards.
bind_rows(x, y, .id = "ID") %>%
  group_by(Task) %>%
  filter(n_distinct(ID) > 1) %>%
  summarise(across(
    -ID,
    ~ if (all(is.na(.x))) NA_real_ else min(.x, na.rm = TRUE)
  ))
You can also do:
# Inner join on Task: only tasks present in both tables survive, with the
# value columns suffixed .x / .y.
merged <- merge(x, y, by = "Task")
# Element-wise minimum of each pair of columns, ignoring NA.
merged$FC <- pmin(merged$FC.x, merged$FC.y, na.rm = TRUE)
merged$FH <- pmin(merged$FH.x, merged$FH.y, na.rm = TRUE)
# Keep the key and the two combined columns; the suffixed inputs go away.
merged <- merged[c("Task", "FC", "FH")]
Output:
Task FC FH
1 B NA 15
2 C 12 17