New to R, my apologies if there is an easy answer that I don't know of.
I have a dataframe with 127.124 observations and 5 variables
Head(SortedDF)
number Retention.time..min. Charge m.z Group
102864 6947 12.58028 5 375.0021 Pro
68971 60641 23.36693 2 375.1373 Pro
75001 104156 24.54187 3 375.1540 Pro
87435 146322 22.69630 3 375.1540 Pro
82658 88256 22.32042 3 375.1541 Pro
113553 97971 14.54600 3 375.1566 Pro
...
I want to compare every row with the row underneath it (so basically rownumber vs rownumber +1) and see if they match. After reading the For and if-else functions, I came up with this code:
for (i in 1:dim(SortedDF))
if(abs(m.z[i]-m.z[i+1])<0.01 | abs(Retention.time..min.[i]-Retention.time..min.[i+1])<1 | (Charge[i]=Charge[i+1]) | Group[i]!=Group[i+1])
print("Match")
else
print("No match")
However, this code does not work as it only prints out the first function function [1], and I'm not sure if i+1 is a thing. Is there any way to solve this not using i+1?
library(tidyverse)
data <- tibble(x = c(1, 1, 2), y = "a")
data
#> # A tibble: 3 × 2
#> x y
#> <dbl> <chr>
#> 1 1 a
#> 2 1 a
#> 3 2 a
same_rows <-
data %>%
# consider all columns
unite(col = "all") %>%
transmute(same_as_next_row = all == lead(all))
data %>%
bind_cols(same_rows)
#> # A tibble: 3 × 3
#> x y same_as_next_row
#> <dbl> <chr> <lgl>
#> 1 1 a TRUE
#> 2 1 a FALSE
#> 3 2 a NA
Created on 2022-03-30 by the reprex package (v2.0.0)
library(tidyverse)
data <- tibble::tribble(
~id, ~number, ~Retention.time..min., ~Charge, ~m.z, ~Group,
102864, 6947, 12.58028, 5, 375.0021, "Pro",
68971, 60641, 23.36693, 2, 375.1373, "Pro",
75001, 104156, 24.54187, 3, 375.1540, "Pro",
87435, 146322, 22.69630, 3, 375.1540, "Pro",
82658, 88256, 22.32042, 3, 375.1541, "Pro",
113553, 97971, 14.54600, 3, 375.1566, "Pro"
)
data %>%
mutate(
matches_with_next_row = (abs(m.z - lead(m.z)) < 0.01) |
(abs(Retention.time..min. - lead(Retention.time..min.)) < 1)
)
#> # A tibble: 6 × 7
#> id number Retention.time..min. Charge m.z Group matches_with_next_row
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <lgl>
#> 1 102864 6947 12.6 5 375. Pro FALSE
#> 2 68971 60641 23.4 2 375. Pro FALSE
#> 3 75001 104156 24.5 3 375. Pro TRUE
#> 4 87435 146322 22.7 3 375. Pro TRUE
#> 5 82658 88256 22.3 3 375. Pro TRUE
#> 6 113553 97971 14.5 3 375. Pro NA
Created on 2022-03-30 by the reprex package (v2.0.0)
Related
I want to filter my grouped dataframe based on the number of occurrences of a specific value within a group.
Some exemplary data:
data <- data.frame(ID = sample(c("A","B","C","D"),100,replace = T),
rt = runif(100,0.2,1),
lapse = sample(1:2,100,replace = T))
The “lapse” column is my filter variable in this case.
I want to exclude every “ID” group that has more than 15 counts of “lapse” == 2 within!
data %>% group_by(ID) %>% count(lapse == 2)
So, if for example the group “A” has 17 times “lapse” == 2 within it should be filtered entirely from the datafame.
First I created some reproducible data using a set.seed and check the number of values per group. It seems that in this case only group D more values with lapse 2 has. You can use filter and sum the values with lapse 2 per group like this:
set.seed(7)
data <- data.frame(ID = sample(c("A","B","C","D"),100,replace = T),
rt = runif(100,0.2,1),
lapse = sample(1:2,100,replace = T))
library(dplyr)
# Check n values per group
data %>%
group_by(ID, lapse) %>%
summarise(n = n())
#> # A tibble: 8 × 3
#> # Groups: ID [4]
#> ID lapse n
#> <chr> <int> <int>
#> 1 A 1 8
#> 2 A 2 7
#> 3 B 1 13
#> 4 B 2 15
#> 5 C 1 18
#> 6 C 2 6
#> 7 D 1 17
#> 8 D 2 16
data %>%
group_by(ID) %>%
filter(!(sum(lapse == 2) > 15))
#> # A tibble: 67 × 3
#> # Groups: ID [3]
#> ID rt lapse
#> <chr> <dbl> <int>
#> 1 B 0.517 2
#> 2 C 0.589 1
#> 3 C 0.598 2
#> 4 C 0.715 1
#> 5 B 0.475 2
#> 6 C 0.965 1
#> 7 B 0.234 1
#> 8 B 0.812 2
#> 9 C 0.517 1
#> 10 B 0.700 1
#> # … with 57 more rows
Created on 2023-01-08 with reprex v2.0.2
how could I add a column in a tibble which is in a list?
The data looks like the following and I just want to add a column at the end which checks if column 2 is the same as 3
[[1]]
# A tibble: 620 x 4
DataName `Technischer Platz...4` `Technischer Platz...5` `Technischer Platz...307`
<chr> <chr> <chr> <lgl>
1 WrW_SAPP17_VIAUFKAFVC 1202362-001 1202362-001 NA
2 WrW_SAPP17_VIAUFKAFVC 1202362-002 037 1202362-002 037 NA
3 WrW_SAPP17_VIAUFKAFVC 1202362-011 1202362-011 NA
4 WrW_SAPP17_VIAUFKAFVC 1202362 1202362 NA
5 WrW_SAPP17_VIAUFKAFVC 1202362-004 052 1202362-004 052 NA
6 WrW_SAPP17_VIAUFKAFVC 1202362-018 005 1202362-018 005 NA
7 WrW_SAPP17_VIAUFKAFVC 1202362-017 1202362-017 NA
8 WrW_SAPP17_VIAUFKAFVC 1202362-012 1202362-012 NA
9 WrW_SAPP17_VIAUFKAFVC 1202362-002 002 1202362-002 002 NA
10 WrW_SAPP17_VIAUFKAFVC 1202362-002 039 1202362-002 039 NA
We can use :
library(dplyr)
library(purrr)
list_df <- map(list_df, ~.x %>%
mutate(is_2_same_as_3 = `Technischer Platz...4` == `Technischer Platz...5`))
If you want to do it based on position of columns use :
list_df <- map(list_df, ~.x %>%
mutate(is_2_same_as_3 = .[[2]] == .[[3]]))
Same in base R would be using lapply :
list_df <- lapply(list_df, function(x) transform(x, is_2_same_as_3 = `Technischer Platz...4` == `Technischer Platz...5`))
list_df <- lapply(list_df, function(x) transform(x, is_2_same_as_3 = x[[2]] == x[[3]]))
Let's use a simpler example
library(tidyverse)
honda <- tibble(brand = "HONDA",
disp = c(3, 8, 20),
hp = c(3, 90, 115))
volvo <- tibble(brand = "VOLVO",
disp = c(3, 8, 20),
hp = c(3, 34, 115))
list_df <- list(honda, volvo)
list_df %>% map(~ mutate(., check = disp == hp))
#> [[1]]
#> # A tibble: 3 x 4
#> brand disp hp check
#> <chr> <dbl> <dbl> <lgl>
#> 1 HONDA 3 3 TRUE
#> 2 HONDA 8 90 FALSE
#> 3 HONDA 20 115 FALSE
#>
#> [[2]]
#> # A tibble: 3 x 4
#> brand disp hp check
#> <chr> <dbl> <dbl> <lgl>
#> 1 VOLVO 3 3 TRUE
#> 2 VOLVO 8 34 FALSE
#> 3 VOLVO 20 115 FALSE
Created on 2021-02-18 by the reprex package (v1.0.0)
I have a number of large data frames that have the following basic format, where the final two rows are a mean (d) and standard deviation (e) - although these are calculated elsewhere.
a b c
a 4 3 4
b 3 2 6
c 2 1 8
d 3 2 6
e 1 1 2
I would like to create an iterative function that converts each raw data point into a z-score via the mean and sd value in d and e per column. The formula I would like to apply is ((x-mean)/SD).
The result would be the following:
a b c
a 1 1 1
b 0 0 0
c -1 -1 -1
I don't mind if this is added to the end, created as a new dataframe or the data is converted.
Thanks!
Here is one approach, note that I do not use the mean/sd provided in the data but re-calculate it on the fly.
Also note that usually the data should be in a tidy data representation, which in your case would mean that a, b, c would be in columns and then mean/sd would be either calculated on the fly or be in a separate column (note that this would reshaping the data, not shown here).
# your input data
raw_data <- data.frame(
a = c(4, 3, 2, 3, 1),
b = c(3, 2, 1, 2, 1),
c = c(4, 6, 8, 6, 2),
row.names = c("a", "b", "c", "d", "e")
)
raw_data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
#> d 3 2 6
#> e 1 1 2
# remove the mean/sd values
data <- raw_data[!rownames(raw_data) %in% c("d", "e"), ]
data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
# quick way to recalculate the values
means <- apply(data, 2, mean)
means
#> a b c
#> 3 2 6
sds <- apply(data, 2, sd)
sds
#> a b c
#> 1 1 2
z_scores <- apply(data, 2, function(x) (x - mean(x)) / sd(x))
z_scores
#> a b c
#> a 1 1 -1
#> b 0 0 0
#> c -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
Edit / Full Code
The following code is a bit longer but most of it is spent on getting the data into the right (long/tidy) format.
If you have any questions, feel free to use the comments.
Note that the tidyverse is really helpful, but might need some time to get used to. The code used here is mostly dplyr (included in the tidyverse).
If you understand the functions: %>% (pipe), group_by(), mutate(), summarise(), and pivot_longer/wider() you got everything.
library(tidyverse)
# use your original dataset again
raw_data <- data.frame(
a = c(4, 3, 2, 3, 1),
b = c(3, 2, 1, 2, 1),
c = c(4, 6, 8, 6, 2),
row.names = c("a", "b", "c", "d", "e")
)
### 1) Turn the data into a nicer format
# match-table how to rename the variables
var_match <- c(d = "mean", e = "sd")
# convert the raw data into a nicer format, first we do some minor changes
# (variable names, etc)
data_mixed <- raw_data %>%
# have the rownames as explicit variable
rownames_to_column("metric") %>%
# nicer printing etc
as_tibble() %>%
# replace variable names with mean/sd
mutate(metric = ifelse(metric %in% c("d", "e"),
var_match[metric], metric))
data_mixed
#> # A tibble: 5 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 4
#> 2 b 3 2 6
#> 3 c 2 1 8
#> 4 mean 3 2 6
#> 5 sd 1 1 2
# separate the dataset into two:
# data holds the values
# data_vars holds the metrics mean and sd
data <- data_mixed %>% filter(!metric %in% var_match) %>% select(-metric)
data_vars <- data_mixed %>% filter(metric %in% var_match)
data
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 4 3 4
#> 2 3 2 6
#> 3 2 1 8
data_vars
#> # A tibble: 2 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 mean 3 2 6
#> 2 sd 1 1 2
# turn the value dataset into its longer form, makes it easier to work with it later
data_long <- data %>%
pivot_longer(everything(), names_to = "var", values_to = "val")
data_long
#> # A tibble: 9 x 2
#> var val
#> <chr> <dbl>
#> 1 a 4
#> 2 b 3
#> 3 c 4
#> 4 a 3
#> 5 b 2
#> 6 c 6
#> 7 a 2
#> 8 b 1
#> 9 c 8
# turn the metric dataset into another long form, allowing easy combination in the next step
data_vars2 <- data_vars %>%
pivot_longer(-metric, names_to = "var", values_to = "val") %>%
pivot_wider(var, names_from = metric, values_from = val)
data_vars2
#> # A tibble: 3 x 3
#> var mean sd
#> <chr> <dbl> <dbl>
#> 1 a 3 1
#> 2 b 2 1
#> 3 c 6 2
# combine the datasets
data_all <- left_join(data_long, data_vars2, by = "var")
data_all
#> # A tibble: 9 x 4
#> var val mean sd
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 1
#> 2 b 3 2 1
#> 3 c 4 6 2
#> 4 a 3 3 1
#> 5 b 2 2 1
#> 6 c 6 6 2
#> 7 a 2 3 1
#> 8 b 1 2 1
#> 9 c 8 6 2
## 2) calculate the z-score
# now comes the actual number crunchin!
# per variable var (a, b, c) compute the variable val_z as the z-score
data_res <- data_all %>%
group_by(var) %>%
mutate(val_z = (val - mean) / sd)
data_res
#> # A tibble: 9 x 5
#> # Groups: var [3]
#> var val mean sd val_z
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 4 3 1 1
#> 2 b 3 2 1 1
#> 3 c 4 6 2 -1
#> 4 a 3 3 1 0
#> 5 b 2 2 1 0
#> 6 c 6 6 2 0
#> 7 a 2 3 1 -1
#> 8 b 1 2 1 -1
#> 9 c 8 6 2 1
## 3) make the results more readable
# lastly pivot the results to its original form
data_res_wide <- data_res %>%
select(var, val_z) %>%
group_by(var) %>%
mutate(id = 1:n()) %>% # needed for easier identification of values
pivot_wider(id, names_from = var, values_from = val_z)
data_res_wide
#> # A tibble: 3 x 4
#> id a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 1 1 -1
#> 2 2 0 0 0
#> 3 3 -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
I tried to transform df into df2. I have done it through a very patchy way using df3, Is there a simpler and more elegant way of doing it?
library(tidyverse)
# I want to transform df
df <- tibble(id = c(1, 2, 1, 2, 1, 2),
time = c('t1', 't1', 't2', 't2', 't3', 't3'),
value = c(2, 3, 6, 4, 5, 7))
df
#> # A tibble: 6 x 3
#> id time value
#> <dbl> <chr> <dbl>
#> 1 1 t1 2
#> 2 2 t1 3
#> 3 1 t2 6
#> 4 2 t2 4
#> 5 1 t3 5
#> 6 2 t3 7
# into df2
df2 <- tibble(id = c(1, 2, 1, 2),
t = c(2, 3, 6, 4),
r = c(6, 4, 5, 7))
df2
#> # A tibble: 4 x 3
#> id t r
#> <dbl> <dbl> <dbl>
#> 1 1 2 6
#> 2 2 3 4
#> 3 1 6 5
#> 4 2 4 7
# This is how I did it, but I think it should be a better way
df3 <- df %>% pivot_wider(names_from = time, values_from = value)
b <- tibble(id = numeric(), t = numeric(), r = numeric())
for (i in 2:3){
a <- df3[,c(1,i,i+1)]
colnames(a) <- c('id', 't', 'r')
b <- bind_rows(a, b)
}
b
#> # A tibble: 4 x 3
#> id t r
#> <dbl> <dbl> <dbl>
#> 1 1 6 5
#> 2 2 4 7
#> 3 1 2 6
#> 4 2 3 4
Created on 2020-11-25 by the reprex package (v0.3.0)
For each id you can use lead to select next value and create r column and drop NA rows.
library(dplyr)
df %>%
group_by(id) %>%
mutate(t = value,
r = lead(value)) %>%
na.omit() %>%
select(id, t, r)
# id t r
# <dbl> <dbl> <dbl>
#1 1 2 6
#2 2 3 4
#3 1 6 5
#4 2 4 7
We can use summarise from dplyr version >= 1.0. Previously, it had the constraint of returning only single observation per group. From version >= 1.0, it is no longer the case. Can return any number of rows i.e. it can be shorter or longer than the original number of rows
library(dplyr)
df %>%
group_by(id) %>%
summarise(t = value[-n()], r = value[-1], .groups = 'drop')
-output
# A tibble: 4 x 3
# id t r
# <dbl> <dbl> <dbl>
#1 1 2 6
#2 1 6 5
#3 2 3 4
#4 2 4 7
I have created a dplyr function to evaluate counts of events for a population. The code works when used with explicit naming of variables within the dplyr::filter and dplyr::group_by functions.
I need to apply the function to 24 variables which are column headers within a data frame. Here they are referred to as x.
I have used !! as I understand that the variable is evaluated as a string rather than a column name.
The function
summary_table <- function(x){
assign(paste(x,"sum_tab", sep="_"),
envir = parent.frame(),
value = df %>%
filter(!is.na(!!x)) %>%
group_by(!!x) %>%
summarise(
'Variable name' = paste0(x),
Discharged = sum(admission_status == "Discharged"),
'Re-attended' = sum(!is.na(re_admission_status)),
'Admitted on Re-attendance' = sum(re_admission_status == "Admitted", na.rm = TRUE)))
}
I have used:
sapply(var_names, summary_table)
However this only outputs one row of the table for each variable in the list var_names
In summary I would like pointers to the correct mechanism to apply the function written above to a list of column names within the dplyr pipe.
Reproducible example
example <- mtcars %>%
group_by(vs) %>%
summarise(
'6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
})
In this example we would want to apply this function to the following list
cars_var <- c("vm", "am", "carb")
This would produce three tables for each column in the list.
As #eipi10 commented, it is usually unwise to automatically create variables. A better idea is to create a single variable that is a list of data frames.
It is also easier to let users apply the groups themselves with group_by() or group_by_at(), so that you don't have to worry about how they provide the names of the variables.
EDIT 2019-05-2
One way is to regard the names of the grouping variables as the 'data', and map over them, creating a copy of the actual data grouped by each one of the grouping variables.
library(dplyr)
library(purrr)
grouping_vars <- c("vs", "am", "carb")
map(grouping_vars, group_by_at, .tbl = mtcars) %>%
map(summarise,
'6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
#> [[1]]
#> # A tibble: 2 x 4
#> vs `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 3 18 14
#> 2 1 4 9 13
#>
#> [[2]]
#> # A tibble: 2 x 4
#> am `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 4 19 19
#> 2 1 3 8 8
#>
#> [[3]]
#> # A tibble: 6 x 4
#> carb `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 1 2 4 7
#> 2 2 0 8 8
#> 3 3 0 3 3
#> 4 4 4 10 9
#> 5 6 1 1 0
#> 6 8 0 1 0
Created on 2019-05-02 by the reprex package (v0.2.1)
Original answer
Here is a function that uses dplyr::groups() to find out which variables have been grouped. Then it iterates over each grouping variable, summarises, and appends the resulting data frame to a list.
library(dplyr)
margins <- function(.data, ...) {
groups <- dplyr::groups(.data)
n <- length(groups)
out <- vector(mode = "list", length = n)
for (i in rev(seq_len(n))) {
out[[i]] <-
.data %>%
dplyr::group_by(!!groups[[i]]) %>%
dplyr::summarise(...) %>%
dplyr::group_by(!!groups[[i]]) # Reapply the original group
}
out
}
mtcars %>%
group_by(vs, am, carb) %>%
margins('6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
#> [[1]]
#> # A tibble: 2 x 4
#> # Groups: vs [2]
#> vs `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 3 18 14
#> 2 1 4 9 13
#>
#> [[2]]
#> # A tibble: 2 x 4
#> # Groups: am [2]
#> am `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 0 4 19 19
#> 2 1 3 8 8
#>
#> [[3]]
#> # A tibble: 6 x 4
#> # Groups: carb [6]
#> carb `6 cylinder` `Large disp` `low gears`
#> <dbl> <int> <int> <int>
#> 1 1 2 4 7
#> 2 2 0 8 8
#> 3 3 0 3 3
#> 4 4 4 10 9
#> 5 6 1 1 0
#> 6 8 0 1 0
Created on 2019-04-24 by the reprex package (v0.2.1.9000)
If you want to group with a vector of variable names, you can use dplyr::group_by_at() and dplyr::vars().
cars_var <- c("vs", "am", "carb")
mtcars %>%
group_by_at(vars(cars_var)) %>%
margins('6 cylinder' = sum(cyl == 6),
'Large disp' = sum(disp >= 100),
'low gears' = sum(gear <= 4))
I am the author of a small package called armgin that implements this and a few similar ideas.