My data looks like this:
# Example data: three subjects observed at each (pos, chr) pair. Position 101
# is listed under both chr 1 and chr 4.
counts <- data.frame(
  pos = rep(c(101, 102, 103, 101), each = 3),
  chr = rep(c(1, 2, 3, 4), each = 3),
  subj = rep(c("A", "B", "C"), times = 4)
)
pos is supposed to belong to only one unique chr, but here pos 101 belongs to both chr 1 and 4.
I can detect this case like:
# Count the distinct chr values seen at each pos and keep only the positions
# that map to more than one chr.
counts %>%
  group_by(pos) %>%
  summarise(n_chrs = n_distinct(chr)) %>%
  filter(n_chrs > 1)
This returns each pos that has more than one chr value:
A tibble: 1 x 2
pos n_chrs
<dbl> <int>
1 101 2
What I'd like is to know which chr values are implicated, something like:
pos chr
1 101 1
2 101 4
Thanks!
You could do:
library(dplyr)
# Reduce the data to its unique (pos, chr) pairs, then keep every pos that
# still has more than one row, i.e. more than one chr.
counts %>%
  distinct(pos, chr) %>%
  group_by(pos) %>%
  filter(n() > 1)
Output:
# A tibble: 2 x 2
# Groups: pos [1]
pos chr
<dbl> <dbl>
1 101 1
2 101 4
An option using data.table
library(data.table)
# Deduplicate on the (pos, chr) pair, not on chr alone: deduplicating by chr
# keeps a single pos per chr, which hides a conflict as soon as one chr holds
# several positions. Then, per pos, return its chr values only when there is
# more than one of them (.N > 1 recycles to select all rows or none).
# Note: setDT() converts `counts` to a data.table by reference.
unique(setDT(counts), by = c("pos", "chr"))[, .(chr = chr[.N > 1]), by = pos]
# pos chr
#1: 101 1
#2: 101 4
Instead of summarize, you could just use mutate to create the group-wise count. This will make sure you keep chr, which you're interested in:
# Attach the per-pos number of distinct chr values to every row (mutate keeps
# chr), keep the conflicting positions, then collapse the repeated rows.
counts %>%
  select(pos, chr) %>%
  group_by(pos) %>%
  mutate(n_chrs = n_distinct(chr)) %>%
  filter(n_chrs > 1) %>%
  distinct()
Result:
# A tibble: 2 x 3
# Groups: pos [1]
pos chr n_chrs
<dbl> <dbl> <int>
1 101 1 2
2 101 4 2
Related
Suppose I have the following dataframe...
# Starting dataframe: row "B" has NA in colA and col_mean
data <- tribble(
~ID, ~Excluded, ~colA, ~colB, ~colC, ~col_mean, ~varA, ~varB,
"A", TRUE, 1, 1, 1, 1, "X", 10,
"B", FALSE, NA, 2, 2, NA, "Y", 20,
"C", FALSE, 3, 3, 3, 3, "Z", 30
)
And a subsetted dataframe (i.e. fewer observations exist) where missing values have been imputed, e.g. ...
# Dataframe with imputed values: holds only IDs "B" and "C" (there is no row
# for "A") and contains no NA
data_imputed <- tribble(
~ID, ~Excluded, ~colA, ~colB, ~colC, ~col_mean, ~varA, ~varB,
"B", FALSE, 2, 2, 2, 2, "Y", 20,
"C", FALSE, 3, 3, 3, 3, "Z", 30
)
How do I replace values in the original dataframe with those from the imputed dataframe when a particular column (e.g. col_mean) has a missing value?
Note: I don't want to replace the whole row with the row from the imputed dataframe, just a specified set of columns (e.g., in this case, those starting with "col").
The target dataframe would look like this...
# Target dataframe: identical to the starting data except that row "B" now
# has colA = 2 and col_mean = 2 in place of NA
data <- tribble(
~ID, ~Excluded, ~colA, ~colB, ~colC, ~col_mean, ~varA, ~varB,
"A", TRUE, 1, 1, 1, 1, "X", 10,
"B", FALSE, 2, 2, 2, 2, "Y", 20,
"C", FALSE, 3, 3, 3, 3, "Z", 30
)
I have tried to summarise the problem with this figure...
I need to do this for four or five sets of columns, so something where I can specify the condition (e.g. is.na(col_mean)) and the columns to use (using regex) would make things easier.
I tend to use tidyverse, so code that works with tidyverse syntax is preferred.
You could use the rows_ family from dplyr. In this case, rows_patch() works well. It modifies existing rows by some key columns (i.e. ID), but only overwrites NA values.
library(dplyr)
# Patch `data` from `data_imputed`, matching rows on ID; rows_patch() fills
# only the values that are NA in `data`.
data %>%
  rows_patch(data_imputed, by = "ID")
# # A tibble: 3 × 8
# ID Excluded colA colB colC col_mean varA varB
# <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
# 1 A TRUE 1 1 1 1 X 10
# 2 B FALSE 2 2 2 2 Y 20
# 3 C FALSE 3 3 3 3 Z 30
# Join the imputed values onto the original rows by ID. Every non-key column
# present in both tables then appears twice: <name>.x (original) and
# <name>.y (imputed).
left_join(data, data_imputed, by = "ID") %>%
# For each .x column, fill its NA values from the .y column of the same base
# name, looked up by name in the current data.
# NOTE(review): cur_data() is deprecated in recent dplyr releases in favour
# of pick() - confirm against the installed version.
mutate(across(ends_with(".x"), ~ coalesce(., cur_data()[[sub("\\.x$", ".y", cur_column())]]))) %>%
# Drop the imputed copies, then strip the .x suffix to restore the original
# column names.
select(-ends_with(".y")) %>%
rename_with(.fn = ~ sub("\\.x$", "", .))
# # A tibble: 3 x 8
# ID Excluded colA colB colC col_mean varA varB
# <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
# 1 A TRUE 1 1 1 1 X 10
# 2 B FALSE 2 2 2 2 Y 20
# 3 C FALSE 3 3 3 3 Z 30
I have a dataframe which looks like this.
Name info.1 info.2
ab a 1
123 a 1
de c 4
456 c 4
fg d 5
789 d 5
The two rows that need to be combined are identical aside from the name column and are together in the dataframe. I want the new dataframe to look like this:
Name ID info.1 info.2
ab 123 a 1
de 456 c 4
fg 789 d 5
I have no clue how to do this and google search hasn't been helpful so far
In base R you could do:
# The rows alternate Name, ID, Name, ID, ...: names sit on the odd rows and
# IDs on the even rows, so column 1 is split by row parity. The info columns
# are identical within a pair, so they are taken from the even rows (which is
# also where the result's row names 2, 4, 6 come from).
# seq_len() keeps the index valid even for a zero-row data frame.
data.frame(Name = df[seq_len(nrow(df)) %% 2 == 1, 1],
           ID = df[seq_len(nrow(df)) %% 2 == 0, 1],
           df[seq_len(nrow(df)) %% 2 == 0, 2:3])
#> Name ID info.1 info.2
#> 2 ab 123 a 1
#> 4 de 456 c 4
#> 6 fg 789 d 5
Created on 2022-07-20 by the reprex package (v2.0.1)
A possible solution:
library(tidyverse)
# Each Name/ID pair shares one info.1 value: glue the pair into a single
# "Name_ID" string per group, then split it back into two columns.
df %>%
  group_by(info.1) %>%
  summarise(Name = str_c(Name, collapse = "_"), info.2 = first(info.2)) %>%
  # convert = TRUE turns the all-digit ID column into integers. It is spelled
  # out because `T` is an ordinary variable that can be reassigned.
  separate(Name, into = c("Name", "ID"), convert = TRUE) %>%
  relocate(info.1, .before = info.2)
#> # A tibble: 3 × 4
#> Name ID info.1 info.2
#> <chr> <int> <chr> <int>
#> 1 ab 123 a 1
#> 2 de 456 c 4
#> 3 fg 789 d 5
Assuming the Name column is consistently ordered Name-ID-Name-ID then:
library(tidyverse)
data <- tibble(
  Name = c("ab", "123", "de", "456", "fg", "789"),
  info.1 = rep(c("a", "c", "d"), each = 2),
  info.2 = rep(c(1, 4, 5), each = 2)
)
# Set the Name column aside and keep one row per (info.1, info.2) combination
data_2 <- distinct(data, info.1, info.2)
# Names are on the odd rows of the original tibble and IDs on the even rows;
# the two-element logical index is recycled down the column
data_2$Name <- data$Name[c(TRUE, FALSE)]
data_2$ID <- data$Name[c(FALSE, TRUE)]
We could also use summarise and extract first as name and last as id:
# Within each (info.1, info.2) pair the first Name is the name and the last
# Name is the ID.
data |>
  group_by(info.1, info.2) |>
  summarise(name = Name[1], ID = Name[n()]) |>
  ungroup()
# Optional extra step to move name/ID to the front: |> relocate(3:4, 1:2)
Output:
# A tibble: 3 × 4
info.1 info.2 name ID
<chr> <dbl> <chr> <chr>
1 a 1 ab 123
2 c 4 de 456
3 d 5 fg 789
We could also use
library(dplyr)
library(stringr)
data %>%
  group_by(across(starts_with("info"))) %>%
  # The all-digit Name within each pair is the ID; it is recycled over the
  # pair and placed as the second column.
  mutate(ID = str_subset(Name, "^\\d+$"), .before = 2) %>%
  ungroup() %>%
  # Keep only the rows whose Name is not all digits.
  filter(!str_detect(Name, "^\\d+$"))
-output
# A tibble: 3 × 4
Name ID info.1 info.2
<chr> <chr> <chr> <dbl>
1 ab 123 a 1
2 de 456 c 4
3 fg 789 d 5
data
# Example input: alternating name / ID rows that share their info columns.
data <- data.frame(
  Name = c("ab", "123", "de", "456", "fg", "789"),
  info.1 = rep(c("a", "c", "d"), each = 2),
  info.2 = rep(c(1, 4, 5), each = 2),
  stringsAsFactors = FALSE
)
I am trying to aggregate records with a specific type into subsequent records.
I have a dataset similar to the following:
# Example input: one record per Id; Type "B" records are to be folded into
# the next Type "A" record.
df_initial <- data.frame(
  Id = as.numeric(1:5),
  Qty = c(105, 110, 100, 115, 120),
  Type = c("A", "B", "B", "A", "A"),
  Difference = c(30, 34, 32, 30, 34)
)
After sorting on the Id field, I'd like to aggregate records of Type = "B" into the next record of type = "A".
In other words, I'm looking to create df_new, which adds the Qty and Difference values for Ids 2 and 3 into the Qty and Difference values for Id 4, and flags Id 4 as being adjusted (in the field AdjustedFlag).
# Desired result: Ids 2 and 3 are folded into Id 4, which is flagged as
# adjusted.
df_new <- data.frame(
  Id = c(1, 4, 5),
  Qty = c(105, 325, 120),
  Type = rep("A", 3),
  Difference = c(30, 96, 34),
  AdjustedFlag = c(0, 1, 0)
)
I'd greatly appreciate any advice or ideas about how to do this in R, preferably using data.table.
A data.table solution:
library(data.table)
# df_initial is a plain data.frame, so it is converted (as a copy) before the
# data.table `[i, j, by]` syntax is used.
# Grouping: counting the "A" rows from the bottom up gives every run of "B"
# rows the same id as the "A" row that closes it. Within a group the last Id
# is kept, Qty and Difference are summed, and the flag is 1 when the group
# absorbed more than one row.
as.data.table(df_initial)[, .(
  Id = Id[.N], Qty = sum(Qty),
  Difference = sum(Difference),
  AdjustedFlag = +(.N > 1)
), by = .(grp = rev(cumsum(rev(Type == "A"))))
][, grp := NULL][]
# Id Qty Difference AdjustedFlag
# <num> <num> <num> <int>
# 1: 1 105 30 0
# 2: 4 325 96 1
# 3: 5 120 34 0
This can be solved by creating a new grouping variable, that groups the rows into the groups you describe, with the idea being to utilize that grouping variable for the desired aggregation.
Instead of having
A B B A A
that new grouping variable should look something like this:
1 2 2 2 3
This is not a data.table solution, but the same logic could be applied there:
library(tidyverse)
df_initial |>
  mutate(
    # Running count of "A" rows. "A" is tested explicitly because
    # as.numeric(factor(Type)) is 1 for "A" only while "A" happens to be the
    # first factor level.
    type2 = cumsum(as.numeric(Type == "A")),
    # Blank out the "B" rows so they can inherit the id of the next "A" row.
    type2 = ifelse(Type == "B", NA, type2)
  ) |>
  fill(type2, .direction = "up") |>
  group_by(type2) |>
  summarise(
    id = max(Id),
    Qty = sum(Qty),
    Difference = sum(Difference),
    AdjustedFlag = as.numeric(n() > 1)
  )
#> # A tibble: 3 × 5
#> type2 id Qty Difference AdjustedFlag
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 105 30 0
#> 2 2 4 325 96 1
#> 3 3 5 120 34 0
Using tidyverse
df_initial %>%
  mutate(
    # gn marks the rows that take part in an adjustment: every "B" row and
    # the row directly after a "B".
    gn = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 'B', Type),
    # A group ends at each "A" row, so its id is the number of "A" rows seen
    # before the current row. Deriving the id from changes in gn instead
    # would merge separate "B" runs that are split by a single "A"
    # (e.g. A B A B A).
    gr = lag(cumsum(Type == 'A'), default = 0L),
    adjusted = if_else(lag(Type, default = 'A') == 'B' | Type == 'B', 1, 0)
  ) %>%
  group_by(gr) %>%
  summarise(Id = last(Id),
            Qty = sum(Qty),
            Type = 'A',
            Difference = sum(Difference),
            Adjusted_flg = max(adjusted)) %>%
  ungroup()
Here we create an interim dataset that looks like:
Id Qty Type Difference gn gr adjusted
1 1 105 A 30 A 0 0
2 2 110 B 34 B 1 1
3 3 100 B 32 B 1 1
4 4 115 A 30 B 1 1
5 5 120 A 34 A 2 0
And use this to create our final table within the summarise. The gr is a column for indicating a group of values, which is why we group_by it.
My dataframe is as below
# Example input: one row per page view; an employee is identified by the
# (Dept, Emp_Id) pair and carries a weight.
df <- data.frame(
  Webpage = rep(c(111, 222), times = c(4, 2)),
  Dept = rep(c(101, 102, 103), times = c(3, 2, 1)),
  Emp_Id = c(1, 1, 2, 3, 4, 4),
  weights = c(5, 5, 2, 3, 4, 5)
)
Webpage Dept Emp_Id weights
111 101 1 5
111 101 1 5
111 101 2 2
111 102 3 3
222 102 4 4
222 103 4 5
For each webpage, I want the number of employees who have seen that webpage, expressed as the sum of their weights, together with the corresponding weight percentage.
Unique employee are unique combination of Dept and Emp_ID
For e.g. webpage 111 is seen by Emp_ID 1,2 and 3. So number of employee seen is sum of their weights i.e 5+2+3 =10 and weight percentage is 0.52(10/19). 19 is the total sum of weights of unique employee(which is the unique combination of Dept and Emp_ID)
Webpage Number_people_seen seen_percentage
111 10 0.52
222 9 0.47
What I tried is below but not sure how to get the sum of weights.
library(dplyr)
# Attempt: lists the unique (Dept, Emp_Id) pairs within each Webpage. The
# weights column is dropped because distinct() keeps only the named columns
# (plus the grouping column) unless .keep_all = TRUE is given.
df %>% group_by(Webpage) %>% distinct(Dept,Emp_Id)
df <- data.frame(Webpage = c(111, 111, 111, 111, 222, 222),
                 Dept = c(101, 101, 101, 102, 102, 103),
                 Emp_Id = c(1, 1, 2, 3, 4, 4),
                 weights = c(5, 5, 2, 3, 4, 5))
library(tidyverse)
df %>%
  group_by(Webpage) %>%
  # One row per (Dept, Emp_Id) employee within each webpage. .keep_all = TRUE
  # retains the weights column; it is spelled out because `T` is an ordinary
  # variable that can be reassigned.
  distinct(Dept, Emp_Id, .keep_all = TRUE) %>%
  summarise(Number_people_seen = sum(weights)) %>%
  # Share of each webpage's total in the grand total.
  mutate(seen_percentage = prop.table(Number_people_seen))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 3
#> Webpage Number_people_seen seen_percentage
#> <dbl> <dbl> <dbl>
#> 1 111 10 0.526
#> 2 222 9 0.474
# An employee is the (Dept, Emp_Id) pair, so Dept must be part of the key:
# grouping by Emp_Id alone collapses two different employees who share an
# Emp_Id and a weight. first() also returns exactly one weight per employee,
# so summarise() never has to emit several rows for one group.
df %>%
  group_by(Webpage, Dept, Emp_Id) %>%
  summarise(no_of_ppl_seen = first(weights)) %>%
  group_by(Webpage) %>%
  summarise(no_of_ppl_seen = sum(no_of_ppl_seen)) %>%
  mutate(seen_percentage = no_of_ppl_seen / sum(no_of_ppl_seen))
# A tibble: 2 x 3
Webpage no_of_ppl_seen seen_percentage
<dbl> <dbl> <dbl>
1 111 10 0.526
2 222 9 0.474
OR
# Drop rows that repeat in every column, total the remaining weights per
# webpage, then express each total as a share of the grand total.
df %>%
  distinct() %>%
  group_by(Webpage) %>%
  summarise(number_ppl_seen = sum(weights)) %>%
  mutate(seen_perc = number_ppl_seen / sum(number_ppl_seen))
I want to conditionally summarize several variables by group. The following code does that, but I'm not sure how to do this without specifying each variable and the conditions in the summarize step.
library(tidyverse)
# Example input: two groups of three rows each, with an indicator that runs
# 1..3 within every group.
dat <- data.frame(
  group = rep(c("A", "B"), each = 3),
  indicator = rep(c(1, 2, 3), times = 2),
  var1 = c(1, 0, 1, 2, 1, 2),
  var2 = c(1, 0, 1, 1, 2, 1)
)
# dat
# group indicator var1 var2
#1 A 1 1 1
#2 A 2 0 0
#3 A 3 1 1
#4 B 1 2 1
#5 B 2 1 2
#6 B 3 2 1
# For each group, sum var1 and var2 over only the rows whose indicator is
# 1 or 2; the condition is repeated by hand for every variable.
dat %>%
group_by(group) %>%
summarise(var1 = sum(var1[indicator==1 | indicator==2]),
var2 = sum(var2[indicator==1 | indicator==2]))
# A tibble: 2 x 3
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3
Use across :
library(dplyr)
# Apply the same conditional sum to every column whose name starts with
# "var": within each group, add up the values on rows where indicator is
# 1 or 2.
dat %>%
  group_by(group) %>%
  summarise(across(starts_with("var"), ~ sum(.x[indicator %in% 1:2])))
# group var1 var2
#* <chr> <dbl> <dbl>
#1 A 1 1
#2 B 3 3