I am trying to replace all values with NAs in a vector (by group) if there
are at least two values that are greater than 4 where x is between 2 and
3.
In this example, in group a, there are 2 values greater than 4 for 2 <= x <= 3.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
tibble(
grp = c("a", "a", "a", "b", "b", "b"),
x = c(1, 2, 3, 1, 2, 3),
val = c(4, 5, 6, 1, 2, 1)
) %>%
group_by(grp) %>%
mutate(val2 = ifelse(sum(val[between(x, 2, 3)] > 4) >= 2, NA, val))
#> # A tibble: 6 × 4
#> # Groups: grp [2]
#> grp x val val2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 1 4 NA
#> 2 a 2 5 NA
#> 3 a 3 6 NA
#> 4 b 1 1 1
#> 5 b 2 2 1
#> 6 b 3 1 1
Expected output
tibble(
grp = c("a", "a", "a", "b", "b", "b"),
x = c(1, 2, 3, 1, 2, 3),
val = c(4, 5, 6, 1, 2, 1),
val2 = c(NA, NA, NA, 1, 2, 1)
)
#> # A tibble: 6 × 4
#> grp x val val2
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 1 4 NA
#> 2 a 2 5 NA
#> 3 a 3 6 NA
#> 4 b 1 1 1
#> 5 b 2 2 2
#> 6 b 3 1 1
Created on 2021-10-25 by the reprex package (v2.0.1)
The problem is that ifelse return a vector with length equal to the first parameter. Since sum(val[between(x, 2, 3)] > 4) >= 2 returns a logical vector of length 1, only the first val is returned and then it's recycled to the full length. For example ifelse(TRUE, 1:3, 11:13) will only return 1. You could either use rep to repeat that value for the full length
mutate(val2 = ifelse(rep(sum(val[between(x, 2, 3)] > 4) >= 2, n()), NA, val))
or use a standard if/else statement
mutate(val2 = if(sum(val[between(x, 2, 3)] > 4) >= 2) NA else val)
Related
Could someone help me with this transformation in R? I would like to transform
this table
ID
Condition
Count
1
A
1
1
B
0
2
A
1
2
B
1
3
A
0
3
B
1
4
A
1
4
B
1
5
A
1
5
B
1
6
A
1
6
B
0
7
A
0
7
B
1
8
A
0
9
B
0
into this table
To create a table of like-against like
A
B
Count of ID
1
0
2
0
0
1
1
1
3
0
1
2
Any help would be appreciated. Thank you.
Phil,
You can do:
with(dat, split(Count, Condition)) |>
table() |>
data.frame()
A B Freq
1 0 0 1
2 1 0 2
3 0 1 2
4 1 1 3
Data:
dat <- structure(list(ID = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,
7, 8, 9), Condition = c("A", "B", "A", "B", "A", "B", "A", "B",
"A", "B", "A", "B", "A", "B", "A", "B"), Count = c(1, 0, 1, 1,
0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-16L))
Here is a tidyverse solution. I filled missing values with 0, please note that this leads to a different count than in your table (do you mean to have 8, 8 as the last two IDs and not 8, 9?):
data <- read.table(text = "ID Condition Count
1 A 1
1 B 0
2 A 1
2 B 1
3 A 0
3 B 1
4 A 1
4 B 1
5 A 1
5 B 1
6 A 1
6 B 0
7 A 0
7 B 1
8 A 0
9 B 0", header = TRUE)
library(tidyr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
data %>%
pivot_wider(
id_cols = ID,
names_from = Condition,
values_from = Count,
values_fill = 0
) %>%
count(A, B, name = "Count of ID")
#> # A tibble: 4 × 3
#> A B `Count of ID`
#> <int> <int> <int>
#> 1 0 0 2
#> 2 0 1 2
#> 3 1 0 2
#> 4 1 1 3
Created on 2023-01-20 by the reprex package (v1.0.0)
I have the following grouped data frame:
library(dplyr)
# Create a sample dataframe
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
I want to update the age of each student so that it is one plus the age in the previous year, with their age in the first year they appear in the dataset remaining unchanged. For example, student A's age should be NA, 6, 7, student B's age should be 7,8,9, and student C's age should be NA, NA, 9.
How about this:
library(dplyr)
df <- data.frame(
student = c("A", "A", "A","B","B", "B", "C", "C","C"),
grade = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
age= c(NA, 6, 6, 7, 7, 7, NA, NA, 9)
)
df %>%
group_by(student) %>%
mutate(age = age + cumsum(!is.na(age))-1)
#> # A tibble: 9 × 3
#> # Groups: student [3]
#> student grade age
#> <chr> <dbl> <dbl>
#> 1 A 1 NA
#> 2 A 2 6
#> 3 A 3 7
#> 4 B 1 7
#> 5 B 2 8
#> 6 B 3 9
#> 7 C 1 NA
#> 8 C 2 NA
#> 9 C 3 9
Created on 2022-12-30 by the reprex package (v2.0.1)
in data.table, assuming the order of the rows is the 'correct' order:
library(data.table)
setDT(df)[, new_age := age + rowid(age) - 1, by = .(student)]
# student grade age new_age
# 1: A 1 NA NA
# 2: A 2 6 6
# 3: A 3 6 7
# 4: B 1 7 7
# 5: B 2 7 8
# 6: B 3 7 9
# 7: C 1 NA NA
# 8: C 2 NA NA
# 9: C 3 9 9
I am following this tutorial about the tidyverse. It starts with a very simple example that I can reproduce:
library(readr)
menu <- read_csv("http://joeystanley.com/data/menu.csv")
#> Parsed with column specification:
#> cols(
#> Category = col_character(),
#> Item = col_character(),
#> Oz = col_double(),
#> Calories = col_double(),
#> Fat = col_double(),
#> Sugars = col_double()
#> )
But when I read a csv file from my own computer, I don't get the Parsed with column specification message. Why?
library(readr)
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c(1, 1, 0, 0, 1))
write.csv(df1,"./df1.csv", row.names = TRUE)
read.csv(file = './df1.csv')
#> X ID var1 var2
#> 1 1 1 a 1
#> 2 2 2 b 1
#> 3 3 3 c 0
#> 4 4 4 d 0
#> 5 5 5 e 1
I have tried using options(readr.num_columns = 1) but nothing changes.
I am using read.csv instead of read_csv. With the latter, it works as expected:
library(readr)
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
var1 = c('a', 'b', 'c', 'd', 'e'),
var2 = c(1, 1, 0, 0, 1))
write.csv(df1,"./df1.csv", row.names = TRUE)
read_csv(file = './df1.csv')
#> Warning: Missing column names filled in: 'X1' [1]
#> Parsed with column specification:
#> cols(
#> X1 = col_double(),
#> ID = col_double(),
#> var1 = col_character(),
#> var2 = col_double()
#> )
#> # A tibble: 5 x 4
#> X1 ID var1 var2
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 1 a 1
#> 2 2 2 b 1
#> 3 3 3 c 0
#> 4 4 4 d 0
#> 5 5 5 e 1
Created on 2021-04-05 by the reprex package (v0.3.0)
I have data as follows:
DT <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 8 x 6
Area Year Group Population_Count Male_Count Female_Count
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5
2 A 1 2 12 7 5
3 A 2 1 10 5 5
4 A 2 2 12 4 8
5 B 1 1 10 5 5
6 B 1 2 13 8 5
7 B 2 1 10 5 5
8 B 2 2 11 6 5
I would like to keep one observations per Area-Year, without losing any information. I tried to do
DTcast <- dcast(DT, Area + Year ~ Group + Population_Count + Male_Count + Female_Count)
But that results in a lot of rubbish:
Area Year 1_10_5_5 2_11_6_5 2_12_4_8 2_12_7_5 2_13_8_5
1 A 1 5 NA NA 5 NA
2 A 2 5 NA 8 NA NA
3 B 1 5 NA NA NA 5
4 B 2 5 5 NA NA NA
In addition, when I apply it to the actual data, I get:
Using 'H_FEMALE' as value column. Use 'value.var' to override
Error in CJ(1:72284, 1:1333365) :
Cross product of elements provided to CJ() would result in 96380955660 rows which exceeds .Machine$integer.max == 2147483647
So I think I am doing something wrong. I think it maybe has to do with the value.var which I do not know how to select.
Desired result:
# A tibble: 4 x 9
Area Year Group `Population_Count_ Group_1` `Male_Count_ Group_1` `Female_Count_ Group_1` `Population_Count_ Group_2` `Male_Count_ Group_2` `Female_Count_ Group_2`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 1 10 5 5 12 7 5
2 A 2 1 10 5 5 12 4 8
3 B 1 1 10 5 5 13 8 5
4 B 2 1 10 5 5 11 6 5
library(tidyverse)
DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
> DT %>% pivot_wider(id_cols = c("Area", "Year"), names_from = "Group", values_from = 4:6)
# A tibble: 4 x 8
Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2 Female_Count_1 Female_Count_2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 10 12 5 7 5 5
2 A 2 10 12 5 4 5 8
3 B 1 10 13 5 8 5 5
4 B 2 10 11 5 6 5 5
This will name your columns as desired
DT %>% pivot_wider(id_cols = c("Area", "Year"),
names_from = "Group",
values_from = 4:6,
names_sep = "_Group_")
use data.table
library(data.table)
dt <- structure(list(Area = c("A", "A", "A", "A", "B", "B", "B", "B"
), Year = c(1, 1, 2, 2, 1, 1, 2, 2), Group = c(1, 2, 1, 2, 1,
2, 1, 2), Population_Count = c(10, 12, 10, 12, 10, 13, 10, 11
), Male_Count = c(5, 7, 5, 4, 5, 8, 5, 6), Female_Count = c(5,
5, 5, 8, 5, 5, 5, 5)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
setDT(dt)
dcast(
dt,
formula = Area + Year ~ Group,
value.var = grep("_Count", names(dt), value = T)
)
#> Area Year Population_Count_1 Population_Count_2 Male_Count_1 Male_Count_2
#> 1: A 1 10 12 5 7
#> 2: A 2 10 12 5 4
#> 3: B 1 10 13 5 8
#> 4: B 2 10 11 5 6
#> Female_Count_1 Female_Count_2
#> 1: 5 5
#> 2: 5 8
#> 3: 5 5
#> 4: 5 5
Created on 2020-12-18 by the reprex package (v0.3.0)
I would like to lag several columns (e.g. value_1 + value_2 + x - see below), define their number of lags (e.g. 3) and their naming. This is some working tedious/manual code:
library(dplyr)
library(lubridate)
library(data.table)
haves <- data.frame(
id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
, date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
, value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, x = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
)
haves$value_2 <- haves$value_2 + 1
haves$x <- haves$x + 2
haves
wants <- haves %>%
group_by(id) %>%
mutate(
value_1_lag_1 = lag(value_1, n = 1, order_by = date)
, value_1_lag_2 = lag(value_1, n = 2, order_by = date)
, value_1_lag_3 = lag(value_1, n = 3, order_by = date)
, value_2_lag_1 = lag(value_2, n = 1, order_by = date)
, value_2_lag_2 = lag(value_2, n = 2, order_by = date)
, value_2_lag_3 = lag(value_2, n = 3, order_by = date)
, x_lag_1 = lag(x, n = 1, order_by = date)
, x_lag_2 = lag(x, n = 2, order_by = date)
, x_lag_3 = lag(x, n = 3, order_by = date)
)
wants
Someone suggested that this offers a solution, so I tried the data table approach without success:
setDT(haves)
haves[, sapply(1:3, function(x){paste0('', x, '_lag_', 1:3)}) := shift(.SD, 1:3),
by = id, .SDcols = value_1:x][]
It does not produce my wants. This comes closer:
colnames <- colnames(haves)
setDT(haves)
haves[, sapply(1:3, function(x){paste0(colnames[x + 2], x, '_lag_', 1:3)}) := shift(.SD, 1:3), by = id, .SDcols = value_1:x][]
Alternatively, I may just use a loop and a function like this:
appender <- function(df, column, lag){
df %>%
group_by(
id
) %>%
mutate(
!!paste0(column, "_lag_", lag) := lag(!!rlang::sym(column), n = lag, order_by = date)
)
}
temp <- appender(haves, "value_2", 3)
Any help would be very much appreciated. Thanks!
Here's the adapted dplyr solution from the linked answer.
haves %>%
group_by(id) %>%
nest %>%
mutate(data = map(data, ~arrange(., date))) %>%
mutate(lags = map(data, function(dat) {
imap_dfc(dat[-1], ~set_names(map(1:3, lag, x = .x),
paste0(.y, "_lag_", 1:3)))
})) %>%
unnest(c(data, lags))
Is this what you're looking for?
Here is an alternative way through data.table.
library(data.table)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:data.table':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday, week,
#> yday, year
#> The following object is masked from 'package:base':
#>
#> date
library(stringr)
haves <- data.frame(
id = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b")
, date = as.Date(c("2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01", "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"))
, value_1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_2 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
, value_3 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
)
setDT(haves)
setorder(haves, date)
N_vars = 3
N_lags = 3
current_vars = paste0( "value_", rep(1:N_vars, each = 1) )
new_vars = paste0( "value_", rep(1:N_vars, each = N_lags), "_lag_", 1:N_lags )
to_define = new_vars[ str_detect(new_vars, "lag_1") ]
haves[ , (new_vars) := shift( .SD, 1:N_lags ), .SDcols = current_vars]
haves
#> id date value_1 value_2 value_3 value_1_lag_1 value_1_lag_2
#> 1: a 2015-01-01 1 1 1 NA NA
#> 2: b 2015-01-01 7 7 7 1 NA
#> 3: a 2015-02-01 2 2 2 7 1
#> 4: b 2015-02-01 8 8 8 2 7
#> 5: a 2015-03-01 3 3 3 8 2
#> 6: b 2015-03-01 9 9 9 3 8
#> 7: a 2015-04-01 4 4 4 9 3
#> 8: b 2015-04-01 10 10 10 4 9
#> 9: a 2015-05-01 5 5 5 10 4
#> 10: b 2015-05-01 11 11 11 5 10
#> 11: a 2015-06-01 6 6 6 11 5
#> 12: b 2015-06-01 12 12 12 6 11
#> value_1_lag_3 value_2_lag_1 value_2_lag_2 value_2_lag_3 value_3_lag_1
#> 1: NA NA NA NA NA
#> 2: NA 1 NA NA 1
#> 3: NA 7 1 NA 7
#> 4: 1 2 7 1 2
#> 5: 7 8 2 7 8
#> 6: 2 3 8 2 3
#> 7: 8 9 3 8 9
#> 8: 3 4 9 3 4
#> 9: 9 10 4 9 10
#> 10: 4 5 10 4 5
#> 11: 10 11 5 10 11
#> 12: 5 6 11 5 6
#> value_3_lag_2 value_3_lag_3
#> 1: NA NA
#> 2: NA NA
#> 3: 1 NA
#> 4: 7 1
#> 5: 2 7
#> 6: 8 2
#> 7: 3 8
#> 8: 9 3
#> 9: 4 9
#> 10: 10 4
#> 11: 5 10
#> 12: 11 5
Created on 2020-05-04 by the reprex package (v0.3.0)